1 Introduction

Throughout the script, I use 'grey' for ribbons, and purple for lines.

2 Setup

2.1 Data and packages

Libraries:

# Core:

library(tidyverse)
library(brms)

# Multi-feature analyses:

library(FactoMineR)   # for MCA
library(factoextra)   # for MCA
library(ranger)       # for random forests

# Helper:

library(bayesplot)    # for MCMC diagnostic plots
library(patchwork)    # for multi plots

Load in data:

# Read the combined coding spreadsheet; the path is relative to this script's folder.
soju <- read_csv('../data/soju_combined_2023_12_13.csv') # New file, added July 8, 2025 while in Eugene

Show all column names:

# Inspect all 86 coded variables:
colnames(soju)
##  [1] "item"                           "id"                            
##  [3] "source"                         "medium"                        
##  [5] "year"                           "year-binned"                   
##  [7] "company"                        "brand"                         
##  [9] "alcohol_content"                "main_slogan"                   
## [11] "secondary_slogan"               "logo_location"                 
## [13] "logo_modality"                  "logo_what_words"               
## [15] "logo_what_image"                "multimodal_logo_type"          
## [17] "metaphoric_metonymy_motivation" "overall_color"                 
## [19] "comments"                       "hanja"                         
## [21] "roman"                          "hangul_loan_words"             
## [23] "text_count"                     "writing_color"                 
## [25] "font_style"                     "font_weight"                   
## [27] "main_slogan_ending"             "slogan_end_noun"               
## [29] "slogan_end_verb"                "slogan_end_english"            
## [31] "slogan_end_nonverbal"           "secondary_slogan_ending"       
## [33] "noun_token"                     "verb_token"                    
## [35] "english_token"                  "nonverbal_speech_sound_token"  
## [37] "additional_comments"            "background_color"              
## [39] "background_type"                "plants_trees_background"       
## [41] "restaurant_background"          "food_background"               
## [43] "additional_comments_background" "bottle_presence"               
## [45] "bottle_position"                "bottle_color"                  
## [47] "bottle_opacity"                 "bottle_shape"                  
## [49] "bottle_size"                    "neck_height_by_width"          
## [51] "body_height_by_width"           "additional_comments_bottle"    
## [53] "model_gender"                   "model_clothing_color"          
## [55] "female_bare_shoulders"          "female_bare_legs"              
## [57] "female_bare_cleavage"           "additional_comments_model"     
## [59] "model_bottle_holding_hand"      "if_holding_bottle_which_hand"  
## [61] "holding_glass"                  "hand_on_hips"                  
## [63] "hourglass_body_shape"           "body_or_facial_appearance"     
## [65] "full_body"                      "camera_shot"                   
## [67] "additional_comments_body"       "eye_contact"                   
## [69] "smile"                          "head_tilt"                     
## [71] "wink"                           "puckered_lips"                 
## [73] "hand_on_face"                   "gesture_comments"              
## [75] "glass_in_advert"                "shape_of_glass"                
## [77] "glass_comments"                 "vertical_RL_writing"           
## [79] "flavoured_soju"                 "flavour"                       
## [81] "ideophone"                      "ideophone_word_count"          
## [83] "sound_object"                   "sound_object_word_count"       
## [85] "production_area"                "overall_comments"

2.2 Define themes, helper functions, and colors:

Set theme for all ggplots in this session:

# Session-wide ggplot theme: classic look, bold axis titles, and some
# breathing room between the y-axis title and the axis.
soju_theme <- theme_classic() +
  theme(axis.title = element_text(face = 'bold'),
        axis.title.y = element_text(margin = margin(r = 11)))

theme_set(soju_theme)

Create a function, which I’ll call adorn_percentages() in analogy with the janitor package, that first uses count() given some column specifications (...), and then appends a column of percentages.

# Count the columns given in ... (passed straight to dplyr::count(), sorted
# descending) and append a formatted percentage column `p`, in analogy with
# janitor::adorn_percentages().
adorn_percentages <- function(x, ...) {
  counts <- count(x, ..., sort = TRUE)

  counts |>
    mutate(prop = round(n / sum(n), 2) * 100,
           p = str_c(prop, '%')) |>
    select(-prop)
}

3 Data cleaning

3.1 Minor stuff

Change the name of the year-binned variable:

# `year-binned` needs backticks everywhere; give it a syntactic name instead:
soju <- soju |>
  rename(year_binned = `year-binned`)

Fix typo of the ‘Califonia (USA)’ in production_area, and of LotTe 롯데 in company:

# Correct two coding typos in production_area and company:
soju <- soju |>
  mutate(production_area = if_else(production_area == 'Califonia (USA)',
                                   'California (USA)',
                                   production_area),
         company = if_else(company == 'LotTe 롯데',
                           'Lotte 롯데',
                           company))

3.2 Collapse medium

Make the one Banner ad into Magazine. It’s actually part of a series of ads and the surrounding ones were classified as Magazine. Our research assistant seems to have chosen Banner because of the wide format, but it’s clearly printed in the same medium as the others.

# Reclassify the single Banner ad as Magazine (it belongs to a print series):
soju <- soju |>
  mutate(medium = if_else(medium == 'Banner', 'Magazine', medium))

3.3 Collapse company location

We have a lot of different production areas / locations for companies and want to reduce those categories to a smaller, more manageable and more easily reportable set:

# List the raw production areas before collapsing them:
unique(soju$production_area)
##  [1] "Andong (North Gyeongsang Province)"   
##  [2] "Busan Metropolitan City"              
##  [3] "Daejeon Metropolitan City"            
##  [4] "Masan (South Gyeongsang Province)"    
##  [5] "Pocheon (Gyeonggi Province)"          
##  [6] "Kwangju Metropolitan City"            
##  [7] "Seoul Metropolitan City"              
##  [8] "Iksan (North Jeolla Province)"        
##  [9] "Mokpo (South Jeolla Province)"        
## [10] "Namyangju (Gyeonggi Province)"        
## [11] "Mungyeong (North Gyeongsang Province)"
## [12] "Gyeongju (North Gyeongsang Province)" 
## [13] "Daegu Metropolitan City"              
## [14] "Changwon (South Gyeongsang Province)" 
## [15] "Gunsan (North Jeolla Province)"       
## [16] "Changseong (South Jeolla Province)"   
## [17] "California (USA)"                     
## [18] "Gangneung (Gangwon Province)"         
## [19] "Suwon (Gyeonggi Province)"            
## [20] "Pyeongyang (North Korea)"             
## [21] "Cheongju (North Chungcheong Province)"

Let’s reassign:

# Collapse provinces into four macro-regions. A single case_when() replaces
# the four chained if_else() calls (equivalent because no replacement label
# contains a later keyword), and removes the stray trailing comma the original
# mutate() call carried.
soju <- soju |> 
  mutate(production_area_red = case_when(
    str_detect(production_area, 'Gyeongsang')  ~ 'Busan + Daegu + Gyeongsang',
    str_detect(production_area, 'Jeolla')      ~ 'Kwangju + Jeolla',
    str_detect(production_area, 'Gyeonggi')    ~ 'Seoul + Gyeonggi',
    str_detect(production_area, 'Chungcheong') ~ 'Daejon + Chungcheong',
    .default = production_area))

Check:

# Check the intermediate state of the collapsed variable:
unique(soju$production_area_red)
##  [1] "Busan + Daegu + Gyeongsang"   "Busan Metropolitan City"     
##  [3] "Daejeon Metropolitan City"    "Seoul + Gyeonggi"            
##  [5] "Kwangju Metropolitan City"    "Seoul Metropolitan City"     
##  [7] "Kwangju + Jeolla"             "Daegu Metropolitan City"     
##  [9] "California (USA)"             "Gangneung (Gangwon Province)"
## [11] "Pyeongyang (North Korea)"     "Daejon + Chungcheong"

We still need to integrate the main cities:

# Fold the five metropolitan cities into their macro-regions. One case_when()
# replaces five chained if_else() calls; unmatched values pass through via
# .default.
soju <- soju |>
  mutate(production_area_red = case_when(
    production_area_red == 'Busan Metropolitan City'   ~ 'Busan + Daegu + Gyeongsang',
    production_area_red == 'Daegu Metropolitan City'   ~ 'Busan + Daegu + Gyeongsang',
    production_area_red == 'Daejeon Metropolitan City' ~ 'Daejon + Chungcheong',
    production_area_red == 'Seoul Metropolitan City'   ~ 'Seoul + Gyeonggi',
    production_area_red == 'Kwangju Metropolitan City' ~ 'Kwangju + Jeolla',
    .default = production_area_red))

Check again:

# Final check: seven categories remain.
unique(soju$production_area_red)
## [1] "Busan + Daegu + Gyeongsang"   "Daejon + Chungcheong"        
## [3] "Seoul + Gyeonggi"             "Kwangju + Jeolla"            
## [5] "California (USA)"             "Gangneung (Gangwon Province)"
## [7] "Pyeongyang (North Korea)"

3.4 Main slogan and secondary slogan

Check some random main_slogan and secondary_slogan tables.

# Spot-check 10 random main slogans.
# NOTE(review): no set.seed() precedes this, so the sample in the rendered
# output is not reproducible.
soju |> 
  sample_n(10) |> 
  select(year, main_slogan)
## # A tibble: 10 × 2
##     year main_slogan                                                            
##    <dbl> <chr>                                                                  
##  1  2010 소주말고 시원주세요! EN: Don't give me soju, give me Cool (Cheongpung)!
##  2  2016 순한시원 16.9 EN: C1 Soft 16.9                                         
##  3  2015 블루, 하실래요? EN: Would you like Blue?                               
##  4  2011 소주… 맛에서 답을 찾다 EN: Soju… the answer is in the taste            
##  5  1994 다릅니다. 참소주는 맛이있습니다 EN: It's different. Charm soju is tast…
##  6  2001 야와로 떠날 때는- 포켓에 쏘~옥 EN: Slip [it] into your pocket when goi…
##  7  2017 좋은날엔 좋은데이 함께해요 EN: Spend good days together with Good Day  
##  8  1997 순하고 부드러운 자연소주 EN: Soft and smooth natural soju              
##  9  2016 순한시원 16.9 EN: C1 Soft 16.9                                         
## 10  2021 초깔끔한 맛 기다렸썸머! EN: I've been waiting for this super clean tas…
# Spot-check 10 random secondary slogans (same reproducibility caveat —
# no set.seed()):
soju |>  
  sample_n(10) |> 
  select(year, secondary_slogan)
## # A tibble: 10 × 2
##     year secondary_slogan                                                       
##    <dbl> <chr>                                                                  
##  1  2020 LINK TOGETHER!                                                         
##  2  2021 소주는 맛으로만 마시는 게 아니니까 EN: Soju isn't just about the taste…
##  3  2020 당이 없어 다음날이 깻끗하다! 딱! 무가당 소주! EN: Without sugar so you…
##  4  2016 소주부문 13년 연속 1위 EN: Number 1 in the soju category for 13 years …
##  5  2001 대나무숯으로 2번 걸러 깨끗한 소주 EN: Clean soju filtered twice using …
##  6  2016 <NA>                                                                   
##  7  2012 소주는 깨끗함이다. 소주는 이슬이다. EN: Soju is cleanliness [clarity],…
##  8  2014 초정 청정지역의 천연암반수로 만들어 깨끗하고 부드럽다! EN: Brewed usin…
##  9  1997 깊고 부드러운 맛, 깨끗한 뒤끝! EN: Tastes rich and smooth, and the aft…
## 10  2018 <NA>

We should split the Korean and English into separate columns as this violates tidy data principles (two different values in the same column). Also because we will do some text analyses later.

One row has En: instead of EN: for main_slogan. This will have to be fixed:

# Normalize the lone 'En:' marker (id 279) to 'EN:'. Anchoring the pattern on
# the colon avoids accidentally rewriting an unrelated 'En' earlier in the
# slogan (str_replace() replaces only the first match it finds).
soju <- mutate(soju,
               main_slogan = if_else(id == 279,
                                     str_replace(main_slogan, 'En:', 'EN:'),
                                     main_slogan))

# Check:

# Check the fix in context (ids 278-280):
filter(soju, id %in% 278:280) |> pull(main_slogan)
## [1] "여름, 가슴까지 시원한 잎술주세요! EN: It's summer, give me some refreshing Yip-sul that will open up my heart!"                                                             
## [2] "함께 나눌게요! EN: I'll share!"                                                                                                                                             
## [3] "한번은 맛을 위해. 두 번은 깨끗한 오늘을 위해. 세 번은 깨끗한 아침을 위해… EN: Once for the flavour, twice for a clean (clear) day, three times for a clean (clear) morning."

There are cases that don’t have EN: in them. These are the ones that only have English text. We’ll deal with this by duplicating the English text as an English translation.

# Find the ads whose main slogan is English-only (no 'EN: ' marker):

ids <- soju |>
  filter(!str_detect(main_slogan, 'EN: ')) |>
  pull(id)

# For those rows, repeat the English text after an 'EN: ' marker so the later
# Korean/English split fills both columns:

soju <- soju |>
  mutate(main_slogan = if_else(id %in% ids,
                               str_c(main_slogan, ' EN: ', main_slogan),
                               main_slogan))

# Check:

soju |>
  filter(id %in% ids) |>
  pull(main_slogan)
##  [1] "Merry Christmas EN: Merry Christmas"    
##  [2] "Merry Christmas EN: Merry Christmas"    
##  [3] "Merry Christmas EN: Merry Christmas"    
##  [4] "FIGHTING KOREA!!! EN: FIGHTING KOREA!!!"
##  [5] "Merry Christmas EN: Merry Christmas"    
##  [6] "Merry Christmas EN: Merry Christmas"    
##  [7] "Merry Christmas EN: Merry Christmas"    
##  [8] "Merry Christmas EN: Merry Christmas"    
##  [9] "Be Colorful! EN: Be Colorful!"          
## [10] "Be Colorful! EN: Be Colorful!"          
## [11] "DaeSun time EN: DaeSun time"            
## [12] "LINK TOGETHER! EN: LINK TOGETHER!"      
## [13] "BARLEY SOJU EN: BARLEY SOJU"            
## [14] "Sweet & Light EN: Sweet & Light"        
## [15] "Sweet & Light EN: Sweet & Light"

Now we can separate the two columns into Korean and English:

# Split the slogan at the 'EN: ' marker into the Korean original and the
# English translation:
soju <- separate(soju, main_slogan,
                 into = c('main_slogan_korean', 'main_slogan_english'),
                 sep = 'EN: ')

Let’s move on to the secondary slogan. Data point id == 135 is missing a space.

# id 135 has 'EN:' without the trailing space; add it:
soju <- soju |>
  mutate(secondary_slogan = if_else(id == 135,
                                    str_replace(secondary_slogan, 'EN:', 'EN: '),
                                    secondary_slogan))

# Check:

soju |> filter(id == 135) |> pull(secondary_slogan)
## [1] "소주도 부드러운 그린이 좋아요 EN: For soju I also like the gentleness of Green."

Two data points have NA NA, which should be set to proper machine-readable NA:

# Convert literal 'NA NA' strings to real missing values. Using NA_character_
# (rather than bare logical NA) keeps both if_else() branches the same type,
# which older dplyr versions require.
soju <- mutate(soju,
               secondary_slogan = if_else(secondary_slogan == 'NA NA',
                                          NA_character_, secondary_slogan))

# Check:

slice(soju, 265:266) |> pull(secondary_slogan)
## [1] NA NA

Several data points have En. instead of EN::

# Normalize 'En.' markers to 'EN: ' across all secondary slogans.
# NOTE(review): if the source text already has a space after 'En.', this
# yields a double space before the translation — confirm against the data.
soju <- soju |>
  mutate(secondary_slogan = str_replace(secondary_slogan,
                                        'En\\.', 'EN: '))

For the ones where secondary_slogan is English already, double the entry as we did before for the primary slogan so that the translation column is filled:

# Same treatment as the main slogan: find English-only secondary slogans...

ids <- soju |>
  filter(!str_detect(secondary_slogan, 'EN: ')) |>
  pull(id)

# ...and duplicate the text as its own translation:

soju <- soju |>
  mutate(secondary_slogan = if_else(id %in% ids,
                                    str_c(secondary_slogan,
                                          ' EN: ', secondary_slogan),
                                    secondary_slogan))

# Check:

soju |>
  filter(id %in% ids) |>
  pull(secondary_slogan)
##  [1] "Maechui - the Aroma of the Morning Calm EN: Maechui - the Aroma of the Morning Calm"
##  [2] "A: Think Casual EN: A: Think Casual"                                                
##  [3] "A: Think Casual EN: A: Think Casual"                                                
##  [4] "Think Casual EN: Think Casual"                                                      
##  [5] "ORIGINAL NEW-TRO EN: ORIGINAL NEW-TRO"                                              
##  [6] "Think Casual EN: Think Casual"                                                      
##  [7] "Think Casual EN: Think Casual"                                                      
##  [8] "Think Casual EN: Think Casual"                                                      
##  [9] "Think Casual EN: Think Casual"                                                      
## [10] "FOR YOUR SHINING LIFE EN: FOR YOUR SHINING LIFE"                                    
## [11] "FOR YOUR SHINING LIFE EN: FOR YOUR SHINING LIFE"                                    
## [12] "LINK TOGETHER EN: LINK TOGETHER"                                                    
## [13] "LINK TOGETHER! EN: LINK TOGETHER!"                                                  
## [14] "BARLEY SOJU EN: BARLEY SOJU"                                                        
## [15] "New! EN: New!"                                                                      
## [16] "BARLEY SOJU EN: BARLEY SOJU"                                                        
## [17] "JINRO STRAWBERRY EN: JINRO STRAWBERRY"                                              
## [18] "JINRO STRAWBERRY EN: JINRO STRAWBERRY"

Process the secondary slogan similarly:

# Split the secondary slogan into Korean and English columns, as before:
soju <- separate(soju, secondary_slogan,
                 into = c('secondary_slogan_korean', 'secondary_slogan_english'),
                 sep = 'EN: ')

3.5 Source

There are a bunch of sources that are just different specific urls from the same general website. Like, for example, this one:

# Example of many distinct URLs from one general website:
soju |> 
  filter(str_detect(source, 'mackiss')) |> 
  select(source)
## # A tibble: 15 × 1
##    source                                     
##    <chr>                                      
##  1 https://www.mackisscompany.co.kr/ijewoolinn
##  2 https://www.mackisscompany.co.kr/ijewoolinn
##  3 https://www.mackisscompany.co.kr/ijewoolinn
##  4 https://www.mackisscompany.co.kr/rinn21    
##  5 https://www.mackisscompany.co.kr/rinn22    
##  6 https://www.mackisscompany.co.kr/rinn23    
##  7 https://www.mackisscompany.co.kr/rinn24    
##  8 https://www.mackisscompany.co.kr/rinn25    
##  9 https://www.mackisscompany.co.kr/rinn26    
## 10 https://www.mackisscompany.co.kr/rinn27    
## 11 https://www.mackisscompany.co.kr/ijewoolinn
## 12 https://www.mackisscompany.co.kr/ijewoolinn
## 13 https://www.mackisscompany.co.kr/ijewoolinn
## 14 https://www.mackisscompany.co.kr/ijewoolinn
## 15 https://www.mackisscompany.co.kr/ijewoolinn

For reporting purposes, it’ll make more sense to contract these sources. The pattern with all of them is that we want everything up to the first link. Like, for https://www.ad.co.kr/ad/print/show.cjsp?ukey=1389869, we will want https://www.ad.co.kr/.

# Truncate every source URL to its scheme + host (e.g.
# https://www.ad.co.kr/ad/print/... -> https://www.ad.co.kr/).
# Bug fix: the original piped soju into mutate() AND passed it again as the
# first argument — i.e. mutate(soju, soju, source = ...) — splicing the whole
# duplicated data frame in as columns. Pass the data only once.
soju <- soju |>
  mutate(source = str_replace(source, "^(https?://[^/]+).*", "\\1/"))

# Check:

# Spot-check 10 random truncated sources (no set.seed(), so not reproducible):
soju |> 
  sample_n(10) |> 
  select(source)
## # A tibble: 10 × 1
##    source                        
##    <chr>                         
##  1 http://www.cbsoju.com/        
##  2 http://www.bohae.co.kr/       
##  3 http://www.bohae.co.kr/       
##  4 https://newslibrary.naver.com/
##  5 https://newslibrary.naver.com/
##  6 http://c1.co.kr/              
##  7 https://www.hitejinro.com/    
##  8 http://c1.co.kr/              
##  9 https://www.hitejinro.com/    
## 10 http://www.bohae.co.kr/

Let’s further contract the different naver sites https://m.blog.naver.com/, https://post.naver.com/, https://newslibrary.naver.com/ etc. We’ll set everything that has the string naver in it to the most common case. We’ll do the same for the multiple daum ones.

# Collapse the many naver.com and daum.net subdomains to one canonical source
# each (a case_when() on the original values is equivalent to the chained
# if_else() calls here):
soju <- soju |>
  mutate(source = case_when(
    str_detect(source, 'naver') ~ 'https://newslibrary.naver.com/',
    str_detect(source, 'daum')  ~ 'http://blog.daum.net/',
    .default = source))

3.6 Alcohol content

We have two ads for which we do not know the alcohol content, and haven’t been able to retrieve this information from anywhere.

# Show the two ads with unknown alcohol content:
filter(soju, is.na(alcohol_content)) |> 
  select(id, year, company, brand, alcohol_content)
## # A tibble: 2 × 5
##   id     year company        brand                         alcohol_content
##   <chr> <dbl> <chr>          <chr>                         <chr>          
## 1 47     1975 Lotte 롯데     Ta 타                         <NA>           
## 2 49     1971 BaekGwang 백광 Baekgwang Milkamju 백광밀감주 <NA>

We have a few ads that have more than one alcohol content because there were two different drinks advertised.

# Show all ads that advertise two drinks with different alcohol contents
# (coded as 'X&Y'):
filter(soju, str_detect(alcohol_content, '&')) |> 
  select(id, year, company, brand, alcohol_content) |> 
  print(n = Inf)
## # A tibble: 33 × 5
##    id     year company     brand                                 alcohol_content
##    <chr> <dbl> <chr>       <chr>                                 <chr>          
##  1 48     1972 MaSan마산   Muhak 무학                            30&25          
##  2 215    1998 JinRo 진로  Sunhan Jinro 순한 진로 & Jinro Gold … 23&25          
##  3 221    1998 JinRo 진로  Chamjinisulro 참진이슬로 & Jinro Gol… 23&25          
##  4 322    2008 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
##  5 323    2008 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
##  6 336    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
##  7 337    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
##  8 338    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
##  9 339    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 10 340    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 11 341    2009 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 12 387    2015 DaeSun 대선 C1 Blue 시원 블루 & C1 Blue Rose 시…  17.5&15.8      
## 13 388    2015 DaeSun 대선 C1 Blue 시원 블루 & C1 Blue Rose 시…  17.5&15.8      
## 14 390    2015 DaeSun 대선 C1 Blue 시원 블루 로즈 & C1 Blue Gra… 15.8&14        
## 15 391    2015 DaeSun 대선 C1 Blue 시원 블루 로즈 & C1 Blue Gra… 15.8&14        
## 16 532    2019 MuHak 무학  Good Day 좋은 데이 & Good Day Calama… 16.9&12.5      
## 17 533    2019 MuHak 무학  Good Day 좋은 데이 & Good Day Calama… 16.9&12.6      
## 18 553    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 19 554    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 20 555    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 21 556    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 22 557    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 23 558    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 24 559    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 25 560    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 26 561    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 27 562    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 28 563    2010 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 29 569    2011 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 30 570    2011 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 31 571    2011 JinRo 진로  Chamiseul 참이슬                      20.1&19.5      
## 32 573    2012 JinRo 진로  Chamiseul 참이슬                      20.1&19        
## 33 574    2012 JinRo 진로  Chamiseul 참이슬                      20.1&19

We have discussed as a team that the most sensible solution would be to average the two. For most of them, the two numbers are very similar anyway. If we didn’t do this, we’d either have to duplicate the rows for some data points (which creates redundancy for all variables other than alcohol_content), or we’d have to leave them NA, which would entail unnecessary data loss.

But first, for reporting these cases, let’s check the number and percentage out of the total ads.

# Number of ads with an ampersand-coded alcohol content:
soju |>
  filter(str_detect(alcohol_content, '&')) |>
  select(id, year, company, brand, alcohol_content) |>
  nrow()
## [1] 33
# Percentage out of total:

# Compute the count instead of hardcoding the literal 33, so the figure stays
# correct if the data changes (str_detect() returns NA for missing
# alcohol_content, hence na.rm = TRUE):
sum(str_detect(soju$alcohol_content, '&'), na.rm = TRUE) / nrow(soju)
## [1] 0.04140527

We’ll proceed by defining a helper function average_ampersand() that splits a text vector for ‘&’, then loops through the resulting list using map() to make the list first numeric, then take the mean.

# Split each element of a character vector on '&' and return the mean of the
# numeric pieces (e.g. '30&25' -> 27.5; '20' -> 20; NA -> NA).
# Rewritten with base strsplit() + vapply() so the helper is dependency-free
# and guaranteed to return a double vector of length(x).
average_ampersand <- function(x) {
  vapply(strsplit(as.character(x), '&', fixed = TRUE),
         \(parts) mean(as.numeric(parts)),
         numeric(1))
}

## Apply the function:

# Replace the 'X&Y'-style values with their numeric mean:
soju <- soju |>
  mutate(alcohol_content = average_ampersand(alcohol_content))

3.8 Text, slogan and writing features

The text_count variable for id == 81 includes a newspaper text. After discussion, the corrected count should be 52 rather than 842.

# id 81's text_count (842) included a newspaper article; the corrected count
# is 52. Conditioning on the id as well as the value (per the documented
# intent) ensures any other legitimate count of 842 would be left alone.
soju <- mutate(soju,
               text_count = if_else(id == 81 & text_count == 842, 52, text_count))

For id == 535 and id == 539, the text_count variable is NA, and can safely be set to zero. We checked the ads, and they include text only on the bottle, which we do not count.

# ids 535 and 539 only have text on the bottle itself, which we don't count;
# set their missing text_count to zero:
soju <- soju |>
  mutate(text_count = if_else(id %in% c(535, 539), 0, text_count))

Make hanja, roman and hangul_loan_words variables lower case:

# Lower-case the three script/orthography variables in one across() call
# instead of three repeated assignments:
soju <- soju |>
  mutate(across(c(hanja, roman, hangul_loan_words), str_to_lower))

Make reduced variables that code for yes/no for Hanja, Roman letters, and the presence of loan words:

# Reduce the script variables to binary yes/no: 'no' stays 'no', any other
# non-missing value becomes 'yes' (NA stays NA here; fixed below).
yes_no <- \(v) if_else(v == 'no', 'no', 'yes')

soju <- soju |>
  mutate(hanja_red = yes_no(hanja),
         roman_red = yes_no(roman),
         loan_word_red = yes_no(hangul_loan_words))

Set missing cases to no for loan_word_red and hangul_loan_words:

# NOTE: order matters — loan_word_red was derived while hangul_loan_words
# still contained NAs, so both variables need their NAs recoded to 'no' here.
soju <- soju |> 
  mutate(hangul_loan_words = if_else(is.na(hangul_loan_words), 'no', hangul_loan_words),
         loan_word_red = if_else(is.na(loan_word_red), 'no', loan_word_red))

Create two variables, any_green and any_blue that look at the development of green and blue over time.

# Flag ads whose writing color mentions green / blue anywhere in the string.
# Switched ifelse() -> dplyr::if_else() for stricter type checking; rows with
# missing writing_color still yield NA flags, as before.
soju <- mutate(soju,
               any_green = if_else(str_detect(writing_color, '(G|g)reen'),
                                   'has green', 'no green'),
               any_blue = if_else(str_detect(writing_color, '(B|b)lue'),
                                  'has blue', 'no blue'))

We’ll collapse Noun (English) (only 15 cases), Adjective (English) (only 9 cases), and Adverb (English) (only 1 case) to just English.

# Lump the rare English POS endings (Noun/Adjective/Adverb (English)) into a
# single 'English' level:
soju <- soju |>
  mutate(main_slogan_ending_red = ifelse(str_detect(main_slogan_ending, 'English'),
                                         'English',
                                         main_slogan_ending))

Create categorical identifier variables:

# Binary yes/no flags for verb vs. noun slogan endings:
soju <- soju |>
  mutate(verb_ending = if_else(main_slogan_ending_red == 'Verb', 'yes', 'no'),
         noun_ending = if_else(main_slogan_ending_red == 'Noun', 'yes', 'no'))

For secondary_slogan, let’s also collapse the (English) cases:

# As with the main slogan, collapse the English POS endings:
soju <- soju |>
  mutate(secondary_slogan_ending_red = if_else(str_detect(secondary_slogan_ending, 'English'),
                                               'English',
                                               secondary_slogan_ending))

For the font_style variable, fix the typo Calligrahy, and collapse the mixed categories: ‘Calligraphy and print’ becomes ‘Calligraphy’, and ‘Print and calligraphy’ becomes ‘Print’ — as detailed in the codebook, the order reflects prominence, so ‘Calligraphy and print’ is mostly calligraphy, which justifies lumping them together.

# Fix the 'Calligrahy' typo, then collapse the mixed categories to whichever
# style is listed first (= the more prominent one, per the codebook):
soju <- soju |>
  mutate(font_style = if_else(font_style == 'Calligrahy',
                              'Calligraphy', font_style),
         font_style_red = case_when(
           font_style == 'Calligraphy and print' ~ 'Calligraphy',
           font_style == 'Print and calligraphy' ~ 'Print',
           .default = as.character(font_style)))

Create has_calligraphy binary variable (no need to create a second one since that’s exactly the inverse, so they’d be perfectly correlated in the MCA below):

# Binary calligraphy flag for the MCA:
soju <- soju |>
  mutate(has_calligraphy = if_else(font_style_red == 'Calligraphy', 'yes', 'no'))

Since Bold and light for font_weight is only two cases, I’ll collapse that with Bold.

# Only two 'Bold and light' cases; merge them into 'Bold':
soju <- soju |>
  mutate(font_weight = ifelse(font_weight == 'Bold and light',
                              'Bold', font_weight))

Let’s look at slogan_end_verb, and collapse -nida and -ayo/-eyyeo forms to contaymal, and transform Korean 반말 to transliterated panmal for plotting purposes. We’ll get rid of the four endings that have just one case each. Not worth analyzing these due to the extremely low numbers! For the MCA later, we’ll also create a has_panmal variable:

# Show:

# Tabulate verb endings before collapsing them:
soju |> 
  adorn_percentages(slogan_end_verb)
## # A tibble: 8 × 3
##   slogan_end_verb     n p    
##   <chr>           <int> <chr>
## 1 <NA>              415 52%  
## 2 반말              233 29%  
## 3 ~요                90 11%  
## 4 ~니다              55 7%   
## 5 딱                  1 0%   
## 6 어때                1 0%   
## 7 찰랑                1 0%   
## 8 활짝                1 0%
# Reduce:

# Collapse the polite endings (~요, ~니다) to 'contaymal', transliterate 반말
# as 'panmal', and drop the four singleton endings (they fall to NA).
# NOTE(review): has_panmal is NA (not 'no') wherever slogan_end_verb_red is
# NA — confirm that is intended for the MCA.
soju <- soju |>
  mutate(slogan_end_verb_red = case_when(slogan_end_verb %in% c('~요', '~니다') ~ 'contaymal',
                                         slogan_end_verb == '반말' ~ 'panmal',
                                         .default = NA),
         has_panmal = if_else(slogan_end_verb_red == 'panmal', 'yes', 'no'))

3.9 Bottle

In the code book, it says Yes* means the drawing of the image of a bottle. Let’s make that more transparent:

# Per the codebook, 'Yes*' means a drawn image of a bottle; relabel it
# transparently:
soju <- soju |>
  mutate(bottle_presence = ifelse(bottle_presence == 'Yes*',
                                  'Drawing', bottle_presence))

For this, we’ll collapse Drawing, Superimposed and Yes to yes, and No stays no.

# Binary bottle flag: everything except 'No' (Drawing, Superimposed, Yes)
# counts as 'yes':
soju <- soju |>
  mutate(has_bottle = if_else(bottle_presence == 'No', 'no', 'yes'))

Extract bottle body and neck height and width using regular expressions. As the regex engine matches at the earliest possible position, [0-9]+ will pick up 32 out of 32x72, and the anchored [0-9]+$ will pick up 72 out of the same string.

# Parse the 'HxW' strings into numeric columns. The unanchored '[0-9]+'
# matches the leading run of digits (the height), and the anchored '[0-9]+$'
# the trailing run (the width). Extraction and numeric conversion are fused
# into one step per column:
soju <- soju |>
  mutate(body_height = as.numeric(str_extract(body_height_by_width, '[0-9]+')),
         body_width  = as.numeric(str_extract(body_height_by_width, '[0-9]+$')),
         neck_height = as.numeric(str_extract(neck_height_by_width, '[0-9]+')),
         neck_width  = as.numeric(str_extract(neck_height_by_width, '[0-9]+$')))

# Check:

# Check the parsed dimensions against the raw strings:
select(soju, body_height_by_width, body_height:body_width,
       neck_height_by_width, neck_height:neck_width)
## # A tibble: 797 × 6
##    body_height_by_width body_height body_width neck_height_by_width neck_height
##    <chr>                      <dbl>      <dbl> <chr>                      <dbl>
##  1 <NA>                          NA         NA <NA>                          NA
##  2 231x476                      231        476 152x217                      152
##  3 <NA>                          NA         NA <NA>                          NA
##  4 61x117                        61        117 37x50                         37
##  5 32x72                         32         72 16x45                         16
##  6 108x207                      108        207 62x114                        62
##  7 104x285                      104        285 43x83                         43
##  8 25x71                         25         71 10x33                         10
##  9 114x306                      114        306 37x91                         37
## 10 173x464                      173        464 66x144                        66
## # ℹ 787 more rows
## # ℹ 1 more variable: neck_width <dbl>

Create body and neck height by width ratios:

# Height-by-width aspect ratios for bottle body and neck:
soju <- soju |>
  mutate(body_ratio = body_height / body_width,
         neck_ratio = neck_height / neck_width)

Create a variable that is the ratio of ratios:

# Neck ratio relative to body ratio:
soju <- soju |>
  mutate(neck_over_body_ratio = neck_ratio / body_ratio)

We’ll collapse the bottle_color categories, specifically Olive green and Green, and collapse Green and clear and Black and clear to their respective colors, and also fix the typo Geen. Clear and brown will be mapped onto Clear, and Brown and white to brown. We’ll achieve most of this by getting rid of any and ... bits so that things will always be merged to the first category. Let’s also merge Brown and Black:

# Collapse bottle colors: strip any trailing ' and <color>' so compound labels
# merge into their first-named color, fix the 'Geen' typo, fold 'Olive green'
# into 'Green', and merge 'Brown'/'Black' into a single 'Brown or black' level.
soju <- mutate(soju,
               # str_remove() is the idiomatic form of str_replace_all(x, pat, '');
               # the anchored pattern matches at most once per string.
               bottle_color = str_remove(bottle_color, ' and [a-z]+$'),
               # Ordered recode (same order as the original if_else chain):
               bottle_color = case_when(bottle_color == 'Geen' ~ 'Green',
                                        bottle_color == 'Olive green' ~ 'Green',
                                        bottle_color %in% c('Black', 'Brown') ~ 'Brown or black',
                                        .default = bottle_color))

Create binary variables for MCA later:

# One binary indicator per collapsed bottle color (MCA input later).
# NOTE(review): rows with a missing bottle_color stay NA rather than 'no' — confirm intended.
soju <- soju |>
  mutate(has_green_bottle = if_else(bottle_color == 'Green', 'yes', 'no'),
         has_clear_bottle = if_else(bottle_color == 'Clear', 'yes', 'no'),
         has_black_or_brown_bottle = if_else(bottle_color == 'Brown or black', 'yes', 'no'))

3.10 Model and gender

Make model gender variable lower case:

# Normalize model gender labels to lower case:
soju <- soju |>
  mutate(model_gender = str_to_lower(model_gender))

Convert missing cases on model_gender to No model: these only show hands, in cartoon style.

# Recode missing model_gender as 'no model' (these ads only show hands, in
# cartoon style). coalesce() is the idiomatic replacement for
# if_else(is.na(x), fill, x).
soju <- mutate(soju,
               model_gender = coalesce(model_gender, 'no model'))

Convert the model_gender variable into model_gender_red, where cases like 1 Female & 1 Male are converted to mixed group, and cases like 3 Males are converted to male group, etc. We’ll count a non-human model as no model, and merge spacing/spelling variants such as ‘1  male’ (extra space) and ‘1female’ into the corresponding solo categories.

# Collapse the raw model_gender labels into the categories 'female solo',
# 'male solo', 'female group', 'male group', 'mixed group', and 'no model'.
# The arms are exact string matches, so spelling/spacing variants get their
# own arms; any unmatched value passes through unchanged via .default.
soju <- mutate(soju,
               model_gender_red = case_when(model_gender == '1 female & 1 male' ~ 'mixed group',
                                            # A single non-human model counts as no model:
                                            model_gender == '1 non-human model' ~ 'no model',
                                            model_gender == '1 male' ~ 'male solo',
                                            model_gender == '1 female' ~ 'female solo',
                                            # Spacing/spelling variants of the solo cases:
                                            model_gender == '1  male' ~ 'male solo',
                                            model_gender == '1female' ~ 'female solo',
                                            model_gender == '6 females & 6 males' ~ 'mixed group',
                                            model_gender == '5 females & 4 males' ~ 'mixed group',
                                            model_gender == '5 females & 1 males' ~ 'mixed group',
                                            model_gender == '3 females & 3 males' ~ 'mixed group',
                                            model_gender == '3 females & 1 male' ~ 'mixed group',
                                            model_gender == '2 male' ~ 'male group',
                                            model_gender == '2 females & 4 males' ~ 'mixed group',
                                            model_gender == '2 females & 3 males' ~ 'mixed group',
                                            # The human model determines the category here:
                                            model_gender == '1 male & 1 non-human model' ~ 'male solo',
                                            model_gender == '1 females & 13 males' ~ 'mixed group',
                                            model_gender == '3 males' ~ 'male group',
                                            model_gender == '2 females & 1 male' ~ 'mixed group',
                                            model_gender == 'mixed gender group' ~ 'mixed group',
                                            model_gender == '4 females & 2 males' ~ 'mixed group',
                                            model_gender == '2 females & 2 males' ~ 'mixed group',
                                            model_gender == '1 female & 2 males' ~ 'mixed group',
                                            model_gender == '1 female & 3 males' ~ 'mixed group',
                                            model_gender == '4 females' ~ 'female group',
                                            model_gender == '2 females' ~ 'female group',
                                            model_gender == '2 males' ~ 'male group',
                                            # Pass everything else through unchanged (e.g. 'no model'):
                                            .default = model_gender))

Create binary has_model, has_female, has_group, has_mixed, and has_male variables:

# Binary presence indicators derived from the reduced gender variable:
soju <- soju |>
  mutate(has_model = if_else(model_gender_red == 'no model', 'no', 'yes'),
         has_female = if_else(model_gender_red %in% c('female solo', 'female group'), 'yes', 'no'),
         has_male = if_else(model_gender_red %in% c('male solo', 'male group'), 'yes', 'no'),
         has_mixed = if_else(model_gender_red == 'mixed group', 'yes', 'no'),
         has_group = if_else(model_gender_red %in% c('mixed group', 'female group', 'male group'), 'yes', 'no'))

3.11 Bottle shape

Check the shapes of the bottle:

# Frequency table of the raw bottle shapes.
# NOTE(review): adorn_percentages() here appears to be a project helper defined
# earlier in the script (it takes a bare column and returns n plus a formatted
# 'p' column) — not janitor::adorn_percentages(), which has a different
# signature. TODO confirm.
soju |> 
  adorn_percentages(bottle_shape) |> 
  print(n = Inf)
## # A tibble: 34 × 3
##    bottle_shape                                                          n p    
##    <chr>                                                             <int> <chr>
##  1 Long neck                                                           523 66%  
##  2 Short neck                                                           92 12%  
##  3 Long thick neck                                                      38 5%   
##  4 <NA>                                                                 29 4%   
##  5 Long neck and convex shoulder                                        22 3%   
##  6 Long neck and long narrow body                                       15 2%   
##  7 Short slanted neck and long narrow body                              15 2%   
##  8 Long and thick neck                                                   7 1%   
##  9 Short neck and long rectangular body                                  7 1%   
## 10 Long neck and concave shoulder                                        5 1%   
## 11 Short neck and long square body                                       4 1%   
## 12 Short neck, long neck and long narrow body                            4 1%   
## 13 Extra long neck and short narrow body                                 3 0%   
## 14 Flask                                                                 3 0%   
## 15 Short neck and long curved body                                       3 0%   
## 16 Long neck and long angular narrow body                                2 0%   
## 17 Long neck and long narrow body, short neck and wide trapezoid bo…     2 0%   
## 18 Long neck and short round body, long neck and short narrow body       2 0%   
## 19 Long neck and wide round body, short wide neck and long round bo…     2 0%   
## 20 Short neck and long round body                                        2 0%   
## 21 Short neck and wide rectangular body                                  2 0%   
## 22 Short neck long rectangular body and long neck                        2 0%   
## 23 Straight neck and convex shoulder                                     2 0%   
## 24 Long neck and short rectangular body                                  1 0%   
## 25 Long neck and wide body                                               1 0%   
## 26 Long neck with patterned glass                                        1 0%   
## 27 Long straight neck                                                    1 0%   
## 28 Rectangle                                                             1 0%   
## 29 Short and thick neck, short neck and long rectangular body            1 0%   
## 30 Short and wide neck                                                   1 0%   
## 31 Short and wide neck and body                                          1 0%   
## 32 Short neck and long round body, short neck and long square body       1 0%   
## 33 Short neck square body, and long neck round bottom                    1 0%   
## 34 Short neck square body, long neck round bottom                        1 0%

Make lowercase:

# Normalize bottle shape labels to lower case:
soju <- soju |>
  mutate(bottle_shape = str_to_lower(bottle_shape))

Map them onto short versus long necks:

# Define helper vectors:

# Shapes classified as having a long neck. Labels must already be lower case
# (done in the previous step); shapes in neither this vector nor short_neck
# become NA in the bottle_shape_red recode below.
long_neck <- c('long neck',
               'long thick neck',
               'long neck and convex shoulder',
               'long neck and long narrow body',
               'long and thick neck',
               'long neck and concave shoulder',
               'extra long neck and short narrow body',
               'long neck and long angular narrow body',
               'long neck and short round body, long neck and short narrow body',
               'long neck and short rectangular body',
               'long neck and wide body',
               'long neck with patterned glass',
               'long straight neck')
# Shapes classified as having a short neck (flask and rectangle included here):
short_neck <- c('short neck',
                'short slanted neck and long narrow body',
                'short neck and long rectangular body',
                'short neck and long square body',
                'flask',
                'short neck and long curved body',
                'short neck and long round body',
                'short neck and wide rectangular body',
                'rectangle',
                'short and thick neck, short neck and long rectangular body',
                'short and wide neck',
                'short and wide neck and body',
                'short neck and long round body, short neck and long square body')

# Reduced variable:

# Reduced bottle shape: shapes in neither helper vector become NA.
soju <- soju |>
  mutate(bottle_shape_red = case_when(
    bottle_shape %in% long_neck ~ 'long neck',
    bottle_shape %in% short_neck ~ 'short neck',
    .default = NA
  ))

3.12 Plants and trees

For the background variable, create a has_plant that collapses plants and trees:

# Collapse plants and trees into a single yes/no plant indicator.
# NOTE(review): missing plants_trees_background values stay NA, not 'yes' — confirm intended.
soju <- mutate(soju,
               plant_background = if_else(plants_trees_background == 'No', 'no', 'yes'))

3.13 Ideophones and sound objects

Check the ideophone and sound object count variables:

# Ideophone word count:

# Inspect the raw (still character-coded) ideophone counts, dropping missing rows:
soju |>
  drop_na(ideophone_word_count) |>
  pull(ideophone_word_count)
##  [1] "1"                          "1"                         
##  [3] "1"                          "2 (지끈지끈 1, 울렁울렁 1)"
##  [5] "1"                          "1"                         
##  [7] "1"                          "2"                         
##  [9] "2"                          "1"                         
## [11] "1"                          "1"                         
## [13] "1"                          "1"                         
## [15] "1"                          "1"                         
## [17] "1"                          "1"                         
## [19] "1"                          "1"                         
## [21] "1"                          "1"                         
## [23] "1"                          "1"                         
## [25] "1"                          "1"                         
## [27] "1"                          "2"                         
## [29] "3"                          "5 (딱 1, 톡 4)"            
## [31] "1"                          "1"                         
## [33] "1"                          "1"                         
## [35] "1"                          "1"                         
## [37] "1"                          "1"                         
## [39] "1"                          "1"                         
## [41] "1"                          "1"                         
## [43] "2"                          "1"                         
## [45] "1"                          "1"                         
## [47] "3"                          "2"
# Sound object word count:

# Inspect the raw (still character-coded) sound object counts, dropping missing rows:
soju |>
  drop_na(sound_object_word_count) |>
  pull(sound_object_word_count)
##  [1] "1"                      "1"                      "1"                     
##  [4] "1"                      "2 (야야야 1, 차차차 1)" "1"                     
##  [7] "2 (ㅇㅋ 1, 앗싸 1)"     "1"                      "1"                     
## [10] "1"                      "4"                      "1"                     
## [13] "1"                      "1"                      "1"                     
## [16] "1"                      "1"                      "1"                     
## [19] "1"                      "1"                      "1"                     
## [22] "1"                      "1"                      "1"                     
## [25] "1"                      "1"                      "3"                     
## [28] "1"                      "1"                      "1"                     
## [31] "1"                      "1"

To have a proper count variable that is coded as numeric, we can just extract the leading numeral (the annotations in parentheses are dropped), and then convert with as.numeric().

# Parse the character counts into numerics. Some entries carry annotations in
# parentheses after the count, so we extract just the leading run of digits.
# str_extract('^\\d+') is robust to counts of 10 or more, unlike taking only
# the first character, and is identical on all values seen in the data.
soju <- soju |> 
  mutate(ideophone_word_count = as.numeric(str_extract(ideophone_word_count, '^\\d+')),
         sound_object_word_count = as.numeric(str_extract(sound_object_word_count, '^\\d+')))

# Check ideophone word count:

# Re-check the now-numeric ideophone counts:
soju |>
  drop_na(ideophone_word_count) |>
  pull(ideophone_word_count)
##  [1] 1 1 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 5 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 2 1 1 1 3 2
# Check sound object word count:

# Re-check the now-numeric sound object counts:
soju |>
  drop_na(sound_object_word_count) |>
  pull(sound_object_word_count)
##  [1] 1 1 1 1 2 1 2 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1

Make a binary presence variable out of this, where any non-missing count is coded as ‘yes’ (present) and a missing count as ‘no’ (absent). This makes sense since we have very few ads that have multiple ideophones or sound objects anyway, and these are often repetitions of the same one.

# Binary presence coding: any non-missing count means at least one token occurred.
soju <- mutate(soju,
               ideophone_red = if_else(is.na(ideophone_word_count), 'no', 'yes'),
               sound_object_red = if_else(is.na(sound_object_word_count), 'no', 'yes'))

4 Overview

4.1 Overall data

How many ads are there in total?

nrow(soju)
## [1] 797

4.2 Source

Where were the ads found?

soju |>
  adorn_percentages(source) |> 
  arrange(desc(n)) |> 
  print(n = Inf)
## # A tibble: 22 × 3
##    source                                 n p    
##    <chr>                              <int> <chr>
##  1 https://newslibrary.naver.com/       233 29%  
##  2 https://www.hitejinro.com/           185 23%  
##  3 http://c1.co.kr/                     119 15%  
##  4 http://www.cbsoju.com/                71 9%   
##  5 http://www.bohae.co.kr/               56 7%   
##  6 https://www.ad.co.kr/                 49 6%   
##  7 https://www.joeunday.co.kr/           20 3%   
##  8 https://www.mackisscompany.co.kr/     15 2%   
##  9 https://happist.com/                  14 2%   
## 10 https://www.fmkorea.com/               8 1%   
## 11 http://blog.daum.net/                  5 1%   
## 12 http://www.mhc.kr/                     5 1%   
## 13 http://pan2world.blogspot.com/         4 1%   
## 14 https://www.nemopan.com/               3 0%   
## 15 http://culture-sshock.tistory.com/     2 0%   
## 16 https://firstsoju.com/                 2 0%   
## 17 http://www.segye.com/                  1 0%   
## 18 https://brunch.co.kr/                  1 0%   
## 19 https://danbis.net/                    1 0%   
## 20 https://data.ad.co.kr/                 1 0%   
## 21 https://magazine.notefolio.net/        1 0%   
## 22 https://www.instiz.net/                1 0%

Write that into a table:

soju |>
  adorn_percentages(source) |> 
  arrange(desc(n)) |> 
  write_excel_csv('../summary_tables/source_counts.csv')

What medium are the ads published in?

soju |> 
  adorn_percentages(medium)
## # A tibble: 3 × 3
##   medium        n p    
##   <chr>     <int> <chr>
## 1 Wall ad     558 70%  
## 2 Newspaper   225 28%  
## 3 Magazine     14 2%

4.3 Data points over time

What’s the range of years?

range(soju$year)
## [1] 1960 2021

How many data points are post-2000?

# Share of ads published after 2000 (the year 2000 itself counts as pre-2000):
soju |>
  mutate(post_2000 = if_else(year <= 2000, 'pre-2000', 'post-2000')) |>
  adorn_percentages(post_2000)
## # A tibble: 2 × 3
##   post_2000     n p    
##   <chr>     <int> <chr>
## 1 post-2000   557 70%  
## 2 pre-2000    240 30%

70% of the data are from after the year 2000.

What about using 1990 as a split?

# Share of ads published after 1990 (the year 1990 itself counts as pre-1990):
soju |>
  mutate(post_1990 = if_else(year <= 1990, 'pre-1990', 'post-1990')) |>
  adorn_percentages(post_1990)
## # A tibble: 2 × 3
##   post_1990     n p    
##   <chr>     <int> <chr>
## 1 post-1990   713 89%  
## 2 pre-1990     84 11%

We want to create a histogram that shows how many data points we have for each year, and we’ll also add the totals per decade at the bottom. These points should occur in between the tick marks ‘1960’ and ‘1970’, ‘1970’ and ‘1980’, and so on, which means we’ll add +5 to the year_binned variable to plot them halfway in between the tick marks.

Count items per decade and format the tibble so that it can serve as input for the ggplot below.

decade_totals <- soju |> 
  count(year_binned)

# Show:

decade_totals
## # A tibble: 7 × 2
##   year_binned     n
##         <dbl> <int>
## 1        1960    36
## 2        1970    31
## 3        1980    16
## 4        1990   156
## 5        2000   132
## 6        2010   335
## 7        2020    91

What years do we have?

# Plot core:

# Core of the per-year histogram: one column per year, dashed decade gridlines.
year_p <- soju |> 
  count(year, sort = TRUE) |> 
  ggplot(aes(x = year, y = n)) +
  # Gridlines stop at 2020 so none falls outside the x-limits set below (the
  # original seq to 2030 triggered a 'Removed 1 row' warning), and `linewidth`
  # replaces the `size` aesthetic deprecated for lines in ggplot2 3.4.0:
  geom_vline(xintercept = seq(1960, 2020, 10),
             linetype = 'dashed', linewidth = 1/4,
             col = 'grey') +
  geom_col(fill = 'grey40')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Scales and axes:

year_p <- year_p +
  # x runs 1955-2029 so the 1960 and 2020 bars are not flush against the panel
  # edge; expand = c(0, 0) removes ggplot's default padding on both axes.
  scale_x_continuous(limits = c(1955, 2029),
                     expand = c(0, 0),
                     breaks = seq(1960, 2020, 10)) +
  # y capped at 70 — presumably headroom above the tallest yearly count; labelled
  # ticks only up to 60. TODO confirm the cap against the data.
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 70),
                     breaks = seq(0, 60, 10)) +
  xlab('Year') +
  ylab('Number of ads')

# Show and save:

year_p
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).

ggsave('../figures/pdf/year_overview.pdf', year_p,
       width = 6.2, height = 3.4)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).
ggsave('../figures/png/year_overview.png', year_p,
       width = 6.2, height = 3.4)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).

4.4 Companies, brands and locations

Example of company/brand to discuss in the paper:

filter(soju, company == 'JinRo 진로') |>
  count(brand, sort = TRUE)
## # A tibble: 23 × 2
##    brand                                           n
##    <chr>                                       <int>
##  1 Chamisul Fresh 참이슬 프레쉬                   85
##  2 Chamjinisulro 참진이슬로                       43
##  3 Chamiseul 참이슬                               36
##  4 Barrel Aged Premium Soju 참나무통 맑은 소주    22
##  5 Jinro 진로                                     21
##  6 Chamisul 16.9 참이슬 16.9                      15
##  7 Ilpoom Jinro 일품진로                           8
##  8 Chamisul Melon 메로나에 이슬                    6
##  9 Chamisul Classic 참이슬 클래식                  4
## 10 Jinro Gold 진로 골드                            4
## # ℹ 13 more rows

How is it distributed by company? Let’s compute a summary table and save this outside of R. In this table, we’ll also add the brands per company. I’ll use write_excel_csv() below as an easy way to ensure that Korean characters are encoded correctly. We’ll also add the year the first ad occurred.

# Compute counts per company:

company_tab <- soju |> 
  # NOTE(review): adorn_percentages() appears to be a project helper defined
  # earlier in the script (returns n plus a formatted 'p' column), not
  # janitor::adorn_percentages() — confirm.
  adorn_percentages(company) |> 
  rename(percentage = p)

# Add brands per company:

# Add the number of distinct brands per company to the company table.
company_tab <- soju |> 
  count(company, brand) |>   # one row per company-brand pair
  count(company) |>          # number of distinct brands per company
  rename(brands = n) |> 
  # Explicit join key: pins the intended key and silences the join message.
  right_join(company_tab, by = join_by(company)) |> 
  relocate(brands, .after = last_col())
## Joining with `by = join_by(company)`
# Add year of first ad in data:

# Add the year of the first and last ad per company, then sort by ad count.
company_tab <- soju |> 
  group_by(company) |> 
  summarize(`first ad` = min(year),
            `last ad` = max(year)) |> 
  # Explicit join key: pins the intended key and silences the join message.
  right_join(company_tab, by = join_by(company)) |> 
  relocate(`first ad`, .after = last_col()) |> 
  relocate(`last ad`, .after = last_col()) |> 
  arrange(desc(n))
## Joining with `by = join_by(company)`
# Make all column names start with capital letters:

company_tab <- rename_all(company_tab, str_to_title)

# Show and save:

company_tab |> 
  print(n = Inf)
## # A tibble: 32 × 6
##    Company             N Percentage Brands `First Ad` `Last Ad`
##    <chr>           <int> <chr>       <int>      <dbl>     <dbl>
##  1 JinRo 진로        267 34%            23       1970      2021
##  2 DaeSun 대선       132 17%            14       1972      2021
##  3 BoHae 보해         85 11%             9       1982      2021
##  4 ChoongBuk 충북     71 9%              2       2005      2021
##  5 KyeungWeol 경월    52 7%              2       1994      1999
##  6 MuHak 무학         44 6%              5       1996      2021
##  7 Lotte 롯데         33 4%              5       1975      2021
##  8 GeumBok 금복       31 4%              3       1977      2020
##  9 Mackis 마키스      15 2%              2       2018      2021
## 10 SeoGwang 서광      12 2%              1       1960      1968
## 11 BaekHwa 백화        8 1%              1       1970      1989
## 12 DaeHan 대한         8 1%              2       1963      1969
## 13 SamHak 삼학         7 1%              2       1966      1997
## 14 BoBae 보배          5 1%              2       1966      1991
## 15 SunYang 선양        4 1%              3       1993      1997
## 16 Nimbus 님바스       3 0%              1       1996      1997
## 17 Andong 안동         2 0%              1       1995      1995
## 18 CheongRo 청로       2 0%              1       1963      1963
## 19 GyeongJu 경주       2 0%              1       1974      1983
## 20 MaSan마산           2 0%              1       1969      1972
## 21 BaekGwang 백광      1 0%              1       1971      1971
## 22 BaekYang 백양       1 0%              1       1963      1963
## 23 CheonMa 천마        1 0%              1       1961      1961
## 24 DaeGwang 대광       1 0%              1       1967      1967
## 25 DaeJeon 대전        1 0%              1       1963      1963
## 26 GeumGok 금곡        1 0%              1       1968      1968
## 27 GeumSu 금수         1 0%              1       1972      1972
## 28 JoHae 조해          1 0%              1       1979      1979
## 29 LA 엘에이           1 0%              1       1997      1997
## 30 MiSeong 미성        1 0%              1       1964      1964
## 31 Pyeongyang 평양     1 0%              1       1995      1995
## 32 TaePyeong 태평      1 0%              1       1966      1966
write_excel_csv(company_tab, '../summary_tables/company_counts.csv')

32 different companies, with JinRo being the clear winner (34%), but overall still quite balanced.

Worth pointing out that the first three companies are more than 60% of the data:

# Cumulative share of the data accounted for by the three biggest companies:
soju |>
  count(company) |>
  mutate(prop = n / sum(n)) |>
  arrange(desc(n)) |>
  slice_head(n = 3) |>
  summarize(majority_proportion = sum(prop))
## # A tibble: 1 × 1
##   majority_proportion
##                 <dbl>
## 1               0.607

What about locations?

# Show:

soju |> 
  adorn_percentages(production_area) |> 
  print(n = Inf)
## # A tibble: 21 × 3
##    production_area                           n p    
##    <chr>                                 <int> <chr>
##  1 Seoul Metropolitan City                 323 41%  
##  2 Busan Metropolitan City                 134 17%  
##  3 Changseong (South Jeolla Province)       85 11%  
##  4 Cheongju (North Chungcheong Province)    70 9%   
##  5 Gangneung (Gangwon Province)             52 7%   
##  6 Changwon (South Gyeongsang Province)     45 6%   
##  7 Daegu Metropolitan City                  31 4%   
##  8 Daejeon Metropolitan City                20 3%   
##  9 Gunsan (North Jeolla Province)            9 1%   
## 10 Mokpo (South Jeolla Province)             7 1%   
## 11 Iksan (North Jeolla Province)             5 1%   
## 12 Andong (North Gyeongsang Province)        3 0%   
## 13 Suwon (Gyeonggi Province)                 3 0%   
## 14 Gyeongju (North Gyeongsang Province)      2 0%   
## 15 Masan (South Gyeongsang Province)         2 0%   
## 16 California (USA)                          1 0%   
## 17 Kwangju Metropolitan City                 1 0%   
## 18 Mungyeong (North Gyeongsang Province)     1 0%   
## 19 Namyangju (Gyeonggi Province)             1 0%   
## 20 Pocheon (Gyeonggi Province)               1 0%   
## 21 Pyeongyang (North Korea)                  1 0%
# Externalize:

soju |> 
  adorn_percentages(production_area) |> 
  write_csv('../summary_tables/production_area_detailed.csv')

More coarse locations is more useful for us for reporting so as to not overwhelm the reader with (in this case) rather unnecessary detail:

# Show:

soju |> 
  adorn_percentages(production_area_red) |> 
  print(n = Inf)
## # A tibble: 7 × 3
##   production_area_red              n p    
##   <chr>                        <int> <chr>
## 1 Seoul + Gyeonggi               328 41%  
## 2 Busan + Daegu + Gyeongsang     218 27%  
## 3 Kwangju + Jeolla               107 13%  
## 4 Daejon + Chungcheong            90 11%  
## 5 Gangneung (Gangwon Province)    52 7%   
## 6 California (USA)                 1 0%   
## 7 Pyeongyang (North Korea)         1 0%
# Externalize:

soju |> 
  adorn_percentages(production_area_red) |> 
  write_csv('../summary_tables/production_area.csv')

4.5 Black-and-white versus color ads

The majority seem to be color ads, and not black and white:

soju |> 
  adorn_percentages(overall_color)
## # A tibble: 2 × 3
##   overall_color       n p    
##   <chr>           <int> <chr>
## 1 Color             707 89%  
## 2 Black and white    90 11%

Clearly, the Black and white are more likely to come from the past. Here’s the average year of publication for these two categories:

soju |> 
  group_by(overall_color) |> 
  summarize(year = mean(year))
## # A tibble: 2 × 2
##   overall_color    year
##   <chr>           <dbl>
## 1 Black and white 1976.
## 2 Color           2010.

Yeah, so the black and white ones are much older, as is to be expected. A more sensible quantity to report might be this:

# Black-and-white versus color shares within each decade bin:
soju |>
  count(year_binned, overall_color) |>
  group_by(year_binned) |>
  mutate(p = str_c(round(n / sum(n), 2) * 100, '%'))
## # A tibble: 10 × 4
## # Groups:   year_binned [7]
##    year_binned overall_color       n p    
##          <dbl> <chr>           <int> <chr>
##  1        1960 Black and white    36 100% 
##  2        1970 Black and white    27 87%  
##  3        1970 Color               4 13%  
##  4        1980 Black and white    12 75%  
##  5        1980 Color               4 25%  
##  6        1990 Black and white    15 10%  
##  7        1990 Color             141 90%  
##  8        2000 Color             132 100% 
##  9        2010 Color             335 100% 
## 10        2020 Color              91 100%

When was the last black and white ad published?

soju |> 
  filter(overall_color == 'Black and white') |> 
  arrange(desc(year)) |> 
  select(overall_color, company, brand, year)
## # A tibble: 90 × 4
##    overall_color   company         brand                 year
##    <chr>           <chr>           <chr>                <dbl>
##  1 Black and white LA 엘에이       LA Chaps 엘에이 챕스  1997
##  2 Black and white KyeungWeol 경월 Green 그린            1996
##  3 Black and white DaeSun 대선     C1 시원               1996
##  4 Black and white DaeSun 대선     Amhaengeosa 암행어사  1996
##  5 Black and white BoHae 보해      Kimsatgat 김삿갓      1996
##  6 Black and white BoHae 보해      Kimsatgat 김삿갓      1996
##  7 Black and white BoHae 보해      City 씨티             1995
##  8 Black and white BoHae 보해      City 씨티             1995
##  9 Black and white SamHak 삼학     Samhak 삼학           1995
## 10 Black and white Andong 안동     Andong Soju 안동소주  1995
## # ℹ 80 more rows

Still, quite a few in the 90’s. I think it would be good to know whether these are genuinely black and white, or whether we just happen to have black and white versions of them.

When was the first color ad?

soju |>
  filter(overall_color == 'Color') |> 
  arrange(year) |> 
  select(overall_color, company, brand, year)
## # A tibble: 707 × 4
##    overall_color company      brand           year
##    <chr>         <chr>        <chr>          <dbl>
##  1 Color         JinRo 진로   Jinro 진로      1970
##  2 Color         JinRo 진로   Jinro 진로      1971
##  3 Color         JinRo 진로   Jinro 진로      1975
##  4 Color         BaekHwa 백화 Baekhwa 백화    1976
##  5 Color         BoHae 보해   Bohae 보해      1985
##  6 Color         BoHae 보해   Bohae 보해      1986
##  7 Color         BaekHwa 백화 Baekhwa 백화    1989
##  8 Color         JinRo 진로   Jinro 진로      1989
##  9 Color         JinRo 진로   Jinro 진로      1990
## 10 Color         BoBae 보배   Bobaeho 보배호  1991
## # ℹ 697 more rows

Only the Jinro and BaekHwa ones are colored in the 70’s, and only Bohae and BaekHwa in the 80’s, then all color ones are from later than that. We’ll have to take this into account when thinking about color trends over time later. The information that we can have about color only kicks in much later. We can’t really make strong inferences about color for anything before the 90’s.

4.6 Alcohol content

What are the ads that we do not have the alcohol content for?

filter(soju, is.na(alcohol_content)) |> 
  select(company, brand, year)
## # A tibble: 2 × 3
##   company        brand                          year
##   <chr>          <chr>                         <dbl>
## 1 Lotte 롯데     Ta 타                          1975
## 2 BaekGwang 백광 Baekgwang Milkamju 백광밀감주  1971

Check the distribution of alcohol content, irrespective of time:

alcohol_counts <- soju |> 
  filter(!is.na(alcohol_content)) |>
  count(alcohol_content, sort = TRUE) |> 
  print(n = Inf)
## # A tibble: 37 × 2
##    alcohol_content     n
##              <dbl> <int>
##  1            16.9   163
##  2            25     145
##  3            19.5    98
##  4            23      40
##  5            21      36
##  6            30      34
##  7            19      30
##  8            19.8    28
##  9            17.8    26
## 10            17.5    25
## 11            16.5    24
## 12            20.1    17
## 13            22      14
## 14            35      14
## 15            18      12
## 16            17.2    11
## 17            13      10
## 18            16.8    10
## 19            16.7     8
## 20            14       7
## 21            17.3     6
## 22            18.5     6
## 23            17       5
## 24            20       4
## 25            13.5     3
## 26            16       3
## 27            14.9     2
## 28            15.9     2
## 29            16.6     2
## 30            19.6     2
## 31            24       2
## 32            14.7     1
## 33            14.8     1
## 34            15.8     1
## 35            19.3     1
## 36            27.5     1
## 37            40       1
# Show:

alcohol_counts
## # A tibble: 37 × 2
##    alcohol_content     n
##              <dbl> <int>
##  1            16.9   163
##  2            25     145
##  3            19.5    98
##  4            23      40
##  5            21      36
##  6            30      34
##  7            19      30
##  8            19.8    28
##  9            17.8    26
## 10            17.5    25
## # ℹ 27 more rows

Make a plot of this:

# Histogram of alcohol contents.
alcohol_counts |> 
  ggplot(aes(x = alcohol_content, y = n)) +
  geom_col(col = 'black') + 
  xlab('Alcohol content') +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 250)) +
  # Upper x-limit widened past 40 so the column at 40 (whose bar extends about
  # half a bar-width either side of 40) is not clipped; the original limit of
  # 40 silently dropped that ad with a removed-row warning.
  scale_x_continuous(limits = c(0, 42),
                     breaks = seq(0, 40, 5))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

Not the most informative plot, so we won’t work on a publication-ready version. That said, something interesting is of note here: We know from the literature on round numbers in advertising and marketing that round numbers are preferred, which explains the peaks at 35, 30, 25, and 20. There is no peak at 15, but then it really peaks at 17. This must have to do with the legal limit, and the desire for companies to fall just under that.

How many soju ads are about drinks that have at least 30°?

# Share of ads for hard (>= 30 degrees) vs. soft liquor; NA contents
# are excluded before classifying:

soju |> 
  filter(!is.na(alcohol_content)) |> 
  mutate(hard_liquor = if_else(alcohol_content >= 30, 'hard', 'soft')) |> 
  adorn_percentages(hard_liquor)
## # A tibble: 2 × 3
##   hard_liquor     n p    
##   <chr>       <int> <chr>
## 1 soft          746 94%  
## 2 hard           49 6%

What are the mean and median alcohol contents? And the range?

# Mean, median, and range of alcohol content (NAs removed):

soju$alcohol_content |> mean(na.rm = TRUE)
## [1] 20.45818
soju$alcohol_content |> median(na.rm = TRUE)
## [1] 19.5
soju$alcohol_content |> range(na.rm = TRUE)
## [1] 13 40

For reporting, what are the companies with the lowest and largest alcohol content?

# Min: all ads tied for the lowest alcohol content.

soju |> 
  select(year, company, brand, alcohol_content) |> 
  filter(alcohol_content == min(alcohol_content, na.rm = TRUE))
## # A tibble: 10 × 4
##     year company    brand                           alcohol_content
##    <dbl> <chr>      <chr>                                     <dbl>
##  1  2015 JinRo 진로 Chamisul Grapefruit 자몽에 이슬              13
##  2  2015 JinRo 진로 Chamisul Grapefruit 자몽에 이슬              13
##  3  2016 JinRo 진로 Chamisul Grape 청포도에 이슬                 13
##  4  2016 JinRo 진로 Chamisul Grape 청포도에 이슬                 13
##  5  2018 JinRo 진로 Chamisul Plum 자두에 이슬                    13
##  6  2018 JinRo 진로 Chamisul Plum 자두에 이슬                    13
##  7  2018 JinRo 진로 Chamisul Plum 자두에 이슬                    13
##  8  2020 JinRo 진로 Chamisul Strawberry 딸기에 이슬              13
##  9  2020 JinRo 진로 Chamisul Strawberry 딸기에 이슬              13
## 10  2021 JinRo 진로 Chamisul Melon 메로나에 이슬                 13
# Max: ad(s) with the highest alcohol content.

soju |> 
  select(year, company, brand, alcohol_content) |> 
  filter(alcohol_content == max(alcohol_content, na.rm = TRUE))
## # A tibble: 1 × 4
##    year company     brand       alcohol_content
##   <dbl> <chr>       <chr>                 <dbl>
## 1  1972 GeumSu 금수 GeumSu 금수              40

Let’s check for 16.9 and 25 and 19.5 - the three most common types of alcohol contents - the composition of companies.

# Which companies sell at 16.9 degrees, and in what proportions:
soju |> 
  filter(alcohol_content == 16.9) |> 
  adorn_percentages(company)
## # A tibble: 5 × 3
##   company           n p    
##   <chr>         <int> <chr>
## 1 DaeSun 대선      95 58%  
## 2 MuHak 무학       35 21%  
## 3 JinRo 진로       23 14%  
## 4 Mackis 마키스     6 4%   
## 5 GeumBok 금복      4 2%

So, it’s actually a few different companies. Could it be that 17% was a legal boundary and that companies tried to stay just beneath it?

Same for 25:

# Company composition at 25 degrees:
soju |> 
  filter(alcohol_content == 25) |> 
  adorn_percentages(company)
## # A tibble: 12 × 3
##    company             n p    
##    <chr>           <int> <chr>
##  1 KyeungWeol 경월    52 36%  
##  2 JinRo 진로         47 32%  
##  3 BoHae 보해         14 10%  
##  4 DaeSun 대선         8 6%   
##  5 GeumBok 금복        8 6%   
##  6 BaekHwa 백화        7 5%   
##  7 Nimbus 님바스       3 2%   
##  8 SamHak 삼학         2 1%   
##  9 BoBae 보배          1 1%   
## 10 JoHae 조해          1 1%   
## 11 LA 엘에이           1 1%   
## 12 Pyeongyang 평양     1 1%

Also quite a few different ones.

# Company composition at 19.5 degrees:
soju |> 
  filter(alcohol_content == 19.5) |> 
  adorn_percentages(company)
## # A tibble: 4 × 3
##   company            n p    
##   <chr>          <int> <chr>
## 1 ChoongBuk 충북    54 55%  
## 2 BoHae 보해        23 23%  
## 3 JinRo 진로        20 20%  
## 4 Lotte 롯데         1 1%

Also again four companies… could 20% be another legal boundary?

4.7 Alcohol content: time trend

Let’s look at the alcohol content over time, a simple average plot as a quick-and-dirty first pass.

# Quick first pass: yearly mean alcohol content as a line plot.

soju |> 
  group_by(year) |> 
  summarize(alcohol_mean = mean(alcohol_content, na.rm = TRUE)) |> 
  ggplot(aes(x = year, y = alcohol_mean)) +
  geom_line() +
  ylab('Average alcohol content') +
  scale_y_continuous(breaks = seq(0, 40, 5),
                     limits = c(0, 40))

Fit a Bayesian GAM:

# Bayesian GAM: Gaussian response, a thin-plate smooth of year, and
# by-company random intercepts; rows with NA alcohol content are
# dropped before fitting. adapt_delta is raised to 0.99 to reduce
# divergent transitions (see the note below on the treedepth warning).

alcohol_mdl <- brm(bf(alcohol_content ~ 1 +
                        s(year) +
                        (1|company)),
                   data = filter(soju, !is.na(alcohol_content)),
                   family = gaussian,
                   
                   # MCMC settings:
                   
                   cores = 4, seed = 42,
                   chains = 4, iter = 6000, warmup = 4000,
                   control = list(adapt_delta = 0.99))

# Save model:

save(alcohol_mdl, file = '../models/alcohol_mdl.RData')

Note: Treedepth warning but diagnostics below show that there’s no divergent transitions and other MCMC diagnostics also look fine.

Bodo action point: Need to investigate and learn about prior choices for GAMs.

Load the model:

load('../models/alcohol_mdl.RData')

Perform hypothesis test for 1960 versus 2020 alcohol content:

# Pull the posterior draws once and extract the first and last
# smooth coefficients of s(year):

draws <- as_draws_df(alcohol_mdl)

year_1 <- draws$`s_syear_1[1]`
year_8 <- draws$`s_syear_1[8]`

# Posterior probability that the first coefficient is smaller than
# the last:

mean(year_1 < year_8)
## [1] 0.963625

Bodo action point: - Can I actually do this? What does this mean? Need to investigate. - Also need to check the direction in which this is going, i.e., what is 1, and what is 8.

# Posterior draws for the linear basis coefficient of s(year):

year_samples <- as_draws_df(alcohol_mdl)$bs_syear_1

# Posterior probability that the year effect is negative:

mean(year_samples < 0)
## [1] 0.999875

4.7.1 Model diagnostics

Check MCMC trace and density overlays across chains:

mcmc_trace(alcohol_mdl)        # trace plots: chains should overlap and mix well
mcmc_dens_overlay(alcohol_mdl) # per-chain posterior densities should coincide

Looks good.

neff_ratio(alcohol_mdl)
##                          b_Intercept                           bs_syear_1 
##                            0.1062995                            0.3655793 
##                sd_company__Intercept                          sds_syear_1 
##                            0.2193374                            0.2294988 
##                                sigma                            Intercept 
##                            0.7035857                            0.1062995 
##     r_company[Andong.안동,Intercept]    r_company[BaekHwa.백화,Intercept] 
##                            0.2601306                            0.1491009 
##   r_company[BaekYang.백양,Intercept]      r_company[BoBae.보배,Intercept] 
##                            0.3717088                            0.1680516 
##      r_company[BoHae.보해,Intercept]   r_company[CheongRo.청로,Intercept] 
##                            0.1120447                            0.2913265 
##    r_company[CheonMa.천마,Intercept]  r_company[ChoongBuk.충북,Intercept] 
##                            0.3958946                            0.1133508 
##   r_company[DaeGwang.대광,Intercept]     r_company[DaeHan.대한,Intercept] 
##                            0.3871676                            0.1576012 
##    r_company[DaeJeon.대전,Intercept]     r_company[DaeSun.대선,Intercept] 
##                            0.3980963                            0.1100550 
##    r_company[GeumBok.금복,Intercept]    r_company[GeumGok.금곡,Intercept] 
##                            0.1173216                            0.3553383 
##     r_company[GeumSu.금수,Intercept]   r_company[GyeongJu.경주,Intercept] 
##                            0.3844292                            0.2933092 
##      r_company[JinRo.진로,Intercept]      r_company[JoHae.조해,Intercept] 
##                            0.1084180                            0.4324010 
## r_company[KyeungWeol.경월,Intercept]       r_company[LA.엘에이,Intercept] 
##                            0.1153453                            0.3585332 
##      r_company[LotTe.롯데,Intercept]   r_company[Mackis.마키스,Intercept] 
##                            0.1188188                            0.1275960 
##       r_company[MaSan마산,Intercept]    r_company[MiSeong.미성,Intercept] 
##                            0.2639563                            0.3910145 
##      r_company[MuHak.무학,Intercept]   r_company[Nimbus.님바스,Intercept] 
##                            0.1145445                            0.2295504 
## r_company[Pyeongyang.평양,Intercept]     r_company[SamHak.삼학,Intercept] 
##                            0.3814630                            0.1540202 
##   r_company[SeoGwang.서광,Intercept]    r_company[SunYang.선양,Intercept] 
##                            0.1657964                            0.1934184 
##  r_company[TaePyeong.태평,Intercept]                         s_syear_1[1] 
##                            0.3678984                            0.3663327 
##                         s_syear_1[2]                         s_syear_1[3] 
##                            0.3515767                            0.7582394 
##                         s_syear_1[4]                         s_syear_1[5] 
##                            0.5854799                            0.5266110 
##                         s_syear_1[6]                         s_syear_1[7] 
##                            0.6042647                            0.3645164 
##                         s_syear_1[8]                               lprior 
##                            0.4356566                            0.2232169 
##                                 lp__ 
##                            0.2003058

Check posterior predictive simulations to assess model plausibility:

pp_check(alcohol_mdl, ndraws = 100)

The model clearly can’t account for all trends. This could be due to clusters in the data emerging from the brands, but fitting both company and brand random intercepts turns out to be impossible because there are not enough companies that have multiple brands, i.e., the two encode fairly similar pieces of information that are hard to disentangle when they’re both in the model. It’s still quite good overall, all things considered.

4.7.2 Plot alcohol model and data

Get the marginal effect to put on top of the plot. Save it in an object for plotting and also show:

# Marginal (conditional) effect of year from the fitted GAM:

alcohol_conditional <- conditional_effects(alcohol_mdl)[['year']]

# Show:

alcohol_conditional
##         year alcohol_content company cond__ effect1__ estimate__      se__
## 1   1960.000        20.45818      NA      1  1960.000   36.03849 1.1300549
## 2   1960.616        20.45818      NA      1  1960.616   35.62077 1.0618326
## 3   1961.232        20.45818      NA      1  1961.232   35.20711 1.0023096
## 4   1961.848        20.45818      NA      1  1961.848   34.78454 0.9511346
## 5   1962.465        20.45818      NA      1  1962.465   34.36562 0.8983701
## 6   1963.081        20.45818      NA      1  1963.081   33.93622 0.8607540
## 7   1963.697        20.45818      NA      1  1963.697   33.49623 0.8326636
## 8   1964.313        20.45818      NA      1  1964.313   33.03630 0.8022554
## 9   1964.929        20.45818      NA      1  1964.929   32.57433 0.7761944
## 10  1965.545        20.45818      NA      1  1965.545   32.10583 0.7500123
## 11  1966.162        20.45818      NA      1  1966.162   31.62701 0.7400761
## 12  1966.778        20.45818      NA      1  1966.778   31.14129 0.7241727
## 13  1967.394        20.45818      NA      1  1967.394   30.65606 0.7011558
## 14  1968.010        20.45818      NA      1  1968.010   30.17654 0.6970931
## 15  1968.626        20.45818      NA      1  1968.626   29.71009 0.6891500
## 16  1969.242        20.45818      NA      1  1969.242   29.26523 0.6770606
## 17  1969.859        20.45818      NA      1  1969.859   28.83524 0.6769051
## 18  1970.475        20.45818      NA      1  1970.475   28.43175 0.6878738
## 19  1971.091        20.45818      NA      1  1971.091   28.05901 0.6980456
## 20  1971.707        20.45818      NA      1  1971.707   27.73255 0.7090874
## 21  1972.323        20.45818      NA      1  1972.323   27.43831 0.7125121
## 22  1972.939        20.45818      NA      1  1972.939   27.17824 0.7166395
## 23  1973.556        20.45818      NA      1  1973.556   26.95412 0.7217377
## 24  1974.172        20.45818      NA      1  1974.172   26.77319 0.7273539
## 25  1974.788        20.45818      NA      1  1974.788   26.62939 0.7348885
## 26  1975.404        20.45818      NA      1  1975.404   26.51861 0.7458050
## 27  1976.020        20.45818      NA      1  1976.020   26.43090 0.7477716
## 28  1976.636        20.45818      NA      1  1976.636   26.36196 0.7461776
## 29  1977.253        20.45818      NA      1  1977.253   26.30711 0.7544092
## 30  1977.869        20.45818      NA      1  1977.869   26.26018 0.7547188
## 31  1978.485        20.45818      NA      1  1978.485   26.21547 0.7565004
## 32  1979.101        20.45818      NA      1  1979.101   26.17596 0.7607823
## 33  1979.717        20.45818      NA      1  1979.717   26.13326 0.7668429
## 34  1980.333        20.45818      NA      1  1980.333   26.08147 0.7708228
## 35  1980.949        20.45818      NA      1  1980.949   26.02288 0.7705435
## 36  1981.566        20.45818      NA      1  1981.566   25.96568 0.7729723
## 37  1982.182        20.45818      NA      1  1982.182   25.90413 0.7671172
## 38  1982.798        20.45818      NA      1  1982.798   25.83210 0.7583060
## 39  1983.414        20.45818      NA      1  1983.414   25.75092 0.7587167
## 40  1984.030        20.45818      NA      1  1984.030   25.67219 0.7547333
## 41  1984.646        20.45818      NA      1  1984.646   25.59310 0.7472968
## 42  1985.263        20.45818      NA      1  1985.263   25.51703 0.7437183
## 43  1985.879        20.45818      NA      1  1985.879   25.44855 0.7458990
## 44  1986.495        20.45818      NA      1  1986.495   25.38760 0.7410526
## 45  1987.111        20.45818      NA      1  1987.111   25.33203 0.7386162
## 46  1987.727        20.45818      NA      1  1987.727   25.28768 0.7410603
## 47  1988.343        20.45818      NA      1  1988.343   25.25801 0.7417062
## 48  1988.960        20.45818      NA      1  1988.960   25.23556 0.7393602
## 49  1989.576        20.45818      NA      1  1989.576   25.21482 0.7351235
## 50  1990.192        20.45818      NA      1  1990.192   25.20279 0.7306557
## 51  1990.808        20.45818      NA      1  1990.808   25.18993 0.7260404
## 52  1991.424        20.45818      NA      1  1991.424   25.17380 0.7229458
## 53  1992.040        20.45818      NA      1  1992.040   25.14522 0.7186122
## 54  1992.657        20.45818      NA      1  1992.657   25.10586 0.7081710
## 55  1993.273        20.45818      NA      1  1993.273   25.04920 0.7047368
## 56  1993.889        20.45818      NA      1  1993.889   24.97790 0.6974113
## 57  1994.505        20.45818      NA      1  1994.505   24.88046 0.6969629
## 58  1995.121        20.45818      NA      1  1995.121   24.75640 0.6918157
## 59  1995.737        20.45818      NA      1  1995.737   24.61038 0.6890199
## 60  1996.354        20.45818      NA      1  1996.354   24.43552 0.6864521
## 61  1996.970        20.45818      NA      1  1996.970   24.23836 0.6819406
## 62  1997.586        20.45818      NA      1  1997.586   24.01512 0.6847644
## 63  1998.202        20.45818      NA      1  1998.202   23.77544 0.6852698
## 64  1998.818        20.45818      NA      1  1998.818   23.51964 0.6860553
## 65  1999.434        20.45818      NA      1  1999.434   23.25272 0.6904965
## 66  2000.051        20.45818      NA      1  2000.051   22.97650 0.6910008
## 67  2000.667        20.45818      NA      1  2000.667   22.69408 0.6911913
## 68  2001.283        20.45818      NA      1  2001.283   22.41366 0.6901239
## 69  2001.899        20.45818      NA      1  2001.899   22.13483 0.6893913
## 70  2002.515        20.45818      NA      1  2002.515   21.86971 0.6896632
## 71  2003.131        20.45818      NA      1  2003.131   21.61540 0.6917050
## 72  2003.747        20.45818      NA      1  2003.747   21.36912 0.6947781
## 73  2004.364        20.45818      NA      1  2004.364   21.13915 0.6973514
## 74  2004.980        20.45818      NA      1  2004.980   20.92804 0.6998227
## 75  2005.596        20.45818      NA      1  2005.596   20.72644 0.6989527
## 76  2006.212        20.45818      NA      1  2006.212   20.53602 0.6956893
## 77  2006.828        20.45818      NA      1  2006.828   20.36110 0.6953641
## 78  2007.444        20.45818      NA      1  2007.444   20.19380 0.6943715
## 79  2008.061        20.45818      NA      1  2008.061   20.03335 0.6917839
## 80  2008.677        20.45818      NA      1  2008.677   19.88046 0.6910024
## 81  2009.293        20.45818      NA      1  2009.293   19.73312 0.6930290
## 82  2009.909        20.45818      NA      1  2009.909   19.59080 0.6909032
## 83  2010.525        20.45818      NA      1  2010.525   19.44928 0.6883569
## 84  2011.141        20.45818      NA      1  2011.141   19.31576 0.6892353
## 85  2011.758        20.45818      NA      1  2011.758   19.18193 0.6894042
## 86  2012.374        20.45818      NA      1  2012.374   19.04826 0.6875871
## 87  2012.990        20.45818      NA      1  2012.990   18.91379 0.6855965
## 88  2013.606        20.45818      NA      1  2013.606   18.78523 0.6876439
## 89  2014.222        20.45818      NA      1  2014.222   18.65875 0.6883763
## 90  2014.838        20.45818      NA      1  2014.838   18.53439 0.6891579
## 91  2015.455        20.45818      NA      1  2015.455   18.41311 0.6890757
## 92  2016.071        20.45818      NA      1  2016.071   18.29819 0.6904254
## 93  2016.687        20.45818      NA      1  2016.687   18.18574 0.6901913
## 94  2017.303        20.45818      NA      1  2017.303   18.07852 0.6889033
## 95  2017.919        20.45818      NA      1  2017.919   17.97516 0.6837959
## 96  2018.535        20.45818      NA      1  2018.535   17.87245 0.6850240
## 97  2019.152        20.45818      NA      1  2019.152   17.77499 0.6861515
## 98  2019.768        20.45818      NA      1  2019.768   17.67712 0.6878777
## 99  2020.384        20.45818      NA      1  2020.384   17.58424 0.6908759
## 100 2021.000        20.45818      NA      1  2021.000   17.48609 0.6939258
##      lower__  upper__
## 1   33.83115 38.25294
## 2   33.56711 37.71216
## 3   33.26007 37.17964
## 4   32.96253 36.66408
## 5   32.61992 36.16456
## 6   32.25473 35.66074
## 7   31.86114 35.16551
## 8   31.47143 34.67572
## 9   31.06529 34.16374
## 10  30.62684 33.64145
## 11  30.18667 33.13348
## 12  29.72300 32.60614
## 13  29.25506 32.08528
## 14  28.80603 31.57398
## 15  28.36484 31.08824
## 16  27.91886 30.62820
## 17  27.49365 30.19450
## 18  27.08435 29.78426
## 19  26.70343 29.44152
## 20  26.34717 29.11448
## 21  26.04005 28.84025
## 22  25.77488 28.60556
## 23  25.54551 28.38130
## 24  25.34737 28.22711
## 25  25.18771 28.08548
## 26  25.08008 27.97656
## 27  24.97791 27.88947
## 28  24.88626 27.83566
## 29  24.81177 27.80167
## 30  24.76326 27.76838
## 31  24.72137 27.73525
## 32  24.67721 27.69443
## 33  24.62553 27.66723
## 34  24.57133 27.61898
## 35  24.52387 27.55646
## 36  24.46168 27.48865
## 37  24.40490 27.41278
## 38  24.33549 27.32386
## 39  24.26410 27.24620
## 40  24.19517 27.15745
## 41  24.11256 27.07394
## 42  24.04692 26.99256
## 43  23.98481 26.92311
## 44  23.92569 26.85063
## 45  23.87362 26.78735
## 46  23.83366 26.74416
## 47  23.80543 26.70697
## 48  23.80222 26.67338
## 49  23.79813 26.64055
## 50  23.79836 26.61373
## 51  23.79914 26.59174
## 52  23.79473 26.55972
## 53  23.78573 26.52124
## 54  23.76283 26.47941
## 55  23.72005 26.41626
## 56  23.64556 26.33312
## 57  23.54853 26.23078
## 58  23.43927 26.10360
## 59  23.29206 25.94511
## 60  23.11317 25.76842
## 61  22.91958 25.57099
## 62  22.68482 25.34841
## 63  22.43929 25.10619
## 64  22.17076 24.84921
## 65  21.90053 24.58566
## 66  21.61903 24.31271
## 67  21.33229 24.03906
## 68  21.04700 23.76350
## 69  20.77603 23.49001
## 70  20.50645 23.21477
## 71  20.25491 22.95888
## 72  20.01533 22.71344
## 73  19.78612 22.48320
## 74  19.56867 22.26207
## 75  19.37521 22.06410
## 76  19.19048 21.87334
## 77  19.01774 21.69771
## 78  18.84832 21.52974
## 79  18.69056 21.37544
## 80  18.54232 21.22317
## 81  18.39572 21.06841
## 82  18.24870 20.92663
## 83  18.10825 20.77981
## 84  17.97646 20.64662
## 85  17.84047 20.50562
## 86  17.70340 20.36188
## 87  17.57256 20.23074
## 88  17.43989 20.10014
## 89  17.30904 19.97328
## 90  17.17647 19.85487
## 91  17.05799 19.74026
## 92  16.94247 19.62375
## 93  16.83519 19.50916
## 94  16.72990 19.39932
## 95  16.63479 19.29579
## 96  16.53655 19.19487
## 97  16.44206 19.10472
## 98  16.33974 19.00842
## 99  16.24234 18.92193
## 100 16.14262 18.84010

Let’s make a better plot with the raw data points:

# Plot core: credible ribbon and posterior mean line from the GAM,
# with jittered raw data points on top.

alcohol_p <- soju |> 
  ggplot(aes(x = year, y = alcohol_content)) +
  geom_ribbon(data = alcohol_conditional,
              mapping = aes(x = year, ymin = lower__, ymax = upper__),
              fill = 'grey') +
  geom_line(data = alcohol_conditional,
            mapping = aes(x = year, y = estimate__),
            # `linewidth` replaces `size` for line geoms (size is
            # deprecated for lines since ggplot2 3.4.0):
            col = 'purple', linewidth = 1.35) +
  geom_jitter(alpha = 0.2,
              width = 0.17,
              height = 0.17)

# Axes and labels:

alcohol_p <- alcohol_p +
  xlab('Year') +
  ylab('Alcohol content °') +
  scale_x_continuous(breaks = seq(1960, 2020, 10),
                     limits = c(1955, 2029),
                     expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(0, 50, 5),
                     limits = c(0, 55),
                     expand = c(0, 0))

# Show and save:

# Printing triggers the removed-rows warning below — presumably the
# 2 jittered points falling outside the axis limits; verify if needed.
alcohol_p
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Save a PDF version:
ggsave(filename = '../figures/pdf/alcohol_time_trend.pdf', plot = alcohol_p,
       width = 5.8, height = 3.7)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Save a PNG version:
ggsave(filename = '../figures/png/alcohol_time_trend.png', plot = alcohol_p,
       width = 5.8, height = 3.7)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

To give one-number summaries of the overall trend, let’s look at the average for the 1960s and the average for the 2010s.

# One-number summaries per decade bin of the downward trend:
soju |> 
  group_by(year_binned) |> 
  summarize(M = mean(alcohol_content, na.rm = TRUE))
## # A tibble: 7 × 2
##   year_binned     M
##         <dbl> <dbl>
## 1        1960  31.7
## 2        1970  26.8
## 3        1980  24.8
## 4        1990  24.4
## 5        2000  20.0
## 6        2010  17.7
## 7        2020  17.2

4.8 Flavor

Whether it is flavored or not:

# Proportion of flavored vs. unflavored soju ads:
adorn_percentages(soju, flavoured_soju)
## # A tibble: 2 × 3
##   flavoured_soju     n p    
##   <chr>          <int> <chr>
## 1 No               773 97%  
## 2 Yes               24 3%

From when are these flavored sojus, and which brands are they?

# All flavored ads, in chronological order:
soju |> 
  filter(flavoured_soju == 'Yes') |> 
  arrange(year) |> 
  select(year, company, brand, alcohol_content, flavour) |> 
  print(n = Inf)
## # A tibble: 24 × 5
##     year company     brand                               alcohol_content flavour
##    <dbl> <chr>       <chr>                                         <dbl> <chr>  
##  1  2015 DaeSun 대선 C1 Blue 시원 블루 로즈 & C1 Blue G…            14.9 grapef…
##  2  2015 DaeSun 대선 C1 Blue 시원 블루 로즈 & C1 Blue G…            14.9 grapef…
##  3  2015 DaeSun 대선 C1 Blue Grapefruit 시원 블루 자몽              14   grapef…
##  4  2015 DaeSun 대선 C1 Blue Grapefruit 시원 블루 자몽              14   grapef…
##  5  2015 DaeSun 대선 C1 Blue Lime 시원 블루 라임                    14   lime   
##  6  2015 DaeSun 대선 C1 Blue Lime 시원 블루 라임                    14   lime   
##  7  2015 Lotte 롯데  Sunhari Chumchurum 순하리 처음처럼             14   citron 
##  8  2015 Lotte 롯데  Sunhari Chumchurum 순하리 처음처럼             14   peach  
##  9  2015 Lotte 롯데  Sunhari Chumchurum 순하리 처음처럼             14   citron 
## 10  2015 MuHak 무학  Good Day Colour 좋은 데이 컬러                 13.5 citron…
## 11  2015 MuHak 무학  Good Day Colour 좋은 데이 컬러                 13.5 citron…
## 12  2015 MuHak 무학  Good Day Colour 좋은 데이 컬러                 13.5 peach/…
## 13  2015 JinRo 진로  Chamisul Grapefruit 자몽에 이슬                13   grapef…
## 14  2015 JinRo 진로  Chamisul Grapefruit 자몽에 이슬                13   grapef…
## 15  2016 JinRo 진로  Chamisul Grape 청포도에 이슬                   13   grape  
## 16  2016 JinRo 진로  Chamisul Grape 청포도에 이슬                   13   grape  
## 17  2018 JinRo 진로  Chamisul Plum 자두에 이슬                      13   plum   
## 18  2018 JinRo 진로  Chamisul Plum 자두에 이슬                      13   plum   
## 19  2018 JinRo 진로  Chamisul Plum 자두에 이슬                      13   plum   
## 20  2019 MuHak 무학  Good Day 좋은 데이 & Good Day Cala…            14.7 calama…
## 21  2019 MuHak 무학  Good Day 좋은 데이 & Good Day Cala…            14.8 calama…
## 22  2020 JinRo 진로  Chamisul Strawberry 딸기에 이슬                13   strawb…
## 23  2020 JinRo 진로  Chamisul Strawberry 딸기에 이슬                13   strawb…
## 24  2021 JinRo 진로  Chamisul Melon 메로나에 이슬                   13   melon

The earliest flavored soju is from 2015.

What is the average alcohol content?

# Mean, min, and max alcohol content by flavored status:
soju |> 
  group_by(flavoured_soju) |> 
  summarize(alc_mean = mean(alcohol_content, na.rm = TRUE),
            alc_min = min(alcohol_content, na.rm = TRUE),
            alc_max = max(alcohol_content, na.rm = TRUE))
## # A tibble: 2 × 4
##   flavoured_soju alc_mean alc_min alc_max
##   <chr>             <dbl>   <dbl>   <dbl>
## 1 No                 20.7    15.8    40  
## 2 Yes                13.7    13      14.9

What flavors?

# Breakdown of which flavors occur among the flavored ads:
soju |> 
  filter(flavoured_soju == 'Yes') |> 
  adorn_percentages(flavour)
## # A tibble: 11 × 3
##    flavour                          n p    
##    <chr>                        <int> <chr>
##  1 grapefruit                       6 25%  
##  2 plum                             3 12%  
##  3 calamansi                        2 8%   
##  4 citron                           2 8%   
##  5 citron/pomegranate/blueberry     2 8%   
##  6 grape                            2 8%   
##  7 lime                             2 8%   
##  8 strawberry                       2 8%   
##  9 melon                            1 4%   
## 10 peach                            1 4%   
## 11 peach/grapefruit                 1 4%

5 Extracting data for qualitative analysis content

5.1 Sampling for alcohol content across time

For the qualitative analysis, we want to sample 120 ads, 20 per decade, but we also want to make sure we get the extremes in alcohol content.

For the 1980’s we just have 16 ads, so we take all. Let’s exclude the 1980’s so we can use group_by() and sample_n() strategically.

# Set the sparsely attested 1980s aside (only 16 ads; they are
# reappended wholesale after sampling):
soju_no_1980 <- filter(soju, year_binned != 1980)

Merge 2010–2020+ since there are only a few years after 2020:

set.seed(42)

# Stratified sample: 20 ads per decade bin, with 2020+ folded into
# the 2010 bin first. NOTE: the drawn rows depend on the seed and on
# the exact sequence of calls — do not reorder these steps.

soju_sample <- soju_no_1980 |> 
  filter(!is.na(alcohol_content)) |> 
  mutate(year_binned = if_else(year_binned > 2010, 2010, year_binned)) |> 
  group_by(year_binned) |> 
  sample_n(20)

Reappend the missing 80’s:

# Add all 1980s ads back in (they were excluded from sampling):
soju_sample <- soju_sample |> 
  bind_rows(filter(soju, year_binned == 1980))

Make a plot of this:

# Sanity-check plot of the sample: loess smooth plus raw points.
soju_sample |> 
  ggplot(aes(x = year, y = alcohol_content)) +
  geom_smooth() +
  geom_point() +
  lims(y = c(10, 45)) +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Top this up with the missing extremes: four of the very high alcohol content ads and the very low alcohol content ads. Let’s take two Mackis and two JinRo to balance things out for the high alcohol content, and then 2 random ones from the low alcohol content (they are all the same brand anyway), and we get to 120:

set.seed(1)
# NOTE(review): group_by(year_binned) |> sample_n(size = 2) draws 2 ads
# per decade bin, not 2 in total — confirm this matches the intended
# "2 random ones from the low alcohol content" in the prose.
low_content <- filter(soju, alcohol_content == 13) |> 
  group_by(year_binned) |> 
  sample_n(size = 2)

# Mackis

set.seed(666)
# Two ads per company among recent (post-2010), high-content (> 20) ads:
high_content <- soju |>
  filter(year_binned > 2010,
         alcohol_content > 20) |> 
  group_by(company) |> 
  sample_n(size = 2)

# Reappend:

# Append the high- and low-content top-ups in one go (same row order
# as appending them one after the other):
soju_sample <- bind_rows(soju_sample, high_content, low_content)

Double check duplicates:

filter(soju_sample, duplicated(id))
## # A tibble: 0 × 125
## # Groups:   year_binned [0]
## # ℹ 125 variables: item <chr>, id <chr>, source <chr>, medium <chr>,
## #   year <dbl>, year_binned <dbl>, company <chr>, brand <chr>,
## #   alcohol_content <dbl>, main_slogan_korean <chr>, main_slogan_english <chr>,
## #   secondary_slogan_korean <chr>, secondary_slogan_english <chr>,
## #   logo_location <chr>, logo_modality <chr>, logo_what_words <chr>,
## #   logo_what_image <chr>, multimodal_logo_type <chr>,
## #   metaphoric_metonymy_motivation <chr>, overall_color <chr>, …

Good.

Any NAs for alcohol content?

soju_sample[is.na(soju_sample$alcohol_content), ]
## # A tibble: 0 × 125
## # Groups:   year_binned [0]
## # ℹ 125 variables: item <chr>, id <chr>, source <chr>, medium <chr>,
## #   year <dbl>, year_binned <dbl>, company <chr>, brand <chr>,
## #   alcohol_content <dbl>, main_slogan_korean <chr>, main_slogan_english <chr>,
## #   secondary_slogan_korean <chr>, secondary_slogan_english <chr>,
## #   logo_location <chr>, logo_modality <chr>, logo_what_words <chr>,
## #   logo_what_image <chr>, multimodal_logo_type <chr>,
## #   metaphoric_metonymy_motivation <chr>, overall_color <chr>, …

None.

Make a plot of this:

soju_sample |> 
  ggplot(aes(x = year, y = alcohol_content)) +
  geom_smooth() +
  geom_point() +
  ylim(10, 45) +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Get all the file names:

all_files <- str_c('../all_images/', soju_sample$item, '.JPG')

Loop:

# file.copy() is vectorized over its `from` argument, so no explicit
# loop is needed; missing files produce the same per-file warnings.
# invisible() suppresses printing the logical success vector.
invisible(file.copy(all_files, '../all_images_sample/'))
## Warning in file.copy(file, "../all_images_sample/"): problem copying
## ../all_images/217_1998_23_JinRo_Sunhan Jinro_002.JPG to
## ../all_images_sample/217_1998_23_JinRo_Sunhan Jinro_002.JPG: No such file or
## directory
## Warning in file.copy(file, "../all_images_sample/"): problem copying
## ../all_images/579_2012_19_JinRo_Chamisul Fresh_001.JPG to
## ../all_images_sample/579_2012_19_JinRo_Chamisul Fresh_001.JPG: No such file or
## directory

Filename for 217_1998_23_JinRo_Sunhan Jinro_002.JPG is different, so copied it over by hand.

Used 579_2012_19_JinRo_Chamisul Fresh_002.JPG instead of id == 572 (the next one over from the same brand) because 001 also has beer. 217 had to be hand-copied over.

Exchange:

soju_sample[soju_sample$id == 579, ] <- soju[soju$id == 572, ]

Write it to table:

write_csv(soju_sample, '../data/soju_sample_120.csv')

This is the file that will be used for qualitative analysis.

5.2 Other sampling approach

Not merge 2010-2020+ this time around:

set.seed(42)

# Same stratified sampling as before, but without folding 2020+ into
# the 2010 bin: 20 ads per decade bin. Seed- and order-sensitive.

soju_sample <- soju_no_1980 |> 
  filter(!is.na(alcohol_content)) |> 
  group_by(year_binned) |> 
  sample_n(20)

Now take the soju_1980 and reappend:

# Add all 1980s ads back in:
soju_sample <- soju_sample |> 
  bind_rows(filter(soju, year_binned == 1980))

Get rid of group info:

# Drop the grouping left over from the grouped sampling step:
soju_sample <- ungroup(soju_sample)

Check:

# Any missing alcohol contents? (expect FALSE)
anyNA(soju_sample$alcohol_content)
## [1] FALSE
# Any duplicated ids? (expect zero rows)
soju_sample |> 
  filter(duplicated(id))
## # A tibble: 0 × 125
## # ℹ 125 variables: item <chr>, id <chr>, source <chr>, medium <chr>,
## #   year <dbl>, year_binned <dbl>, company <chr>, brand <chr>,
## #   alcohol_content <dbl>, main_slogan_korean <chr>, main_slogan_english <chr>,
## #   secondary_slogan_korean <chr>, secondary_slogan_english <chr>,
## #   logo_location <chr>, logo_modality <chr>, logo_what_words <chr>,
## #   logo_what_image <chr>, multimodal_logo_type <chr>,
## #   metaphoric_metonymy_motivation <chr>, overall_color <chr>, …

Other group:

set.seed(1)
# One ad per decade bin among the 13-degree (lowest-content) ads:
low_content <- filter(soju, alcohol_content == 13) |> 
  group_by(year_binned) |> 
  sample_n(size = 1)

# Mackis

set.seed(666)
# One ad per company among recent (post-2010), high-content (> 20) ads:
high_content <- soju |>
  filter(year_binned > 2010,
         alcohol_content > 20) |> 
  group_by(company) |> 
  sample_n(size = 1)

# Reappend:

# Append the high- and low-content top-ups in one go (same row order
# as appending them one after the other):
soju_sample <- bind_rows(soju_sample, high_content, low_content)

Make a plot of this:

# Sanity-check plot of the alternative sample:
soju_sample |> 
  ggplot(aes(x = year, y = alcohol_content)) +
  geom_smooth() +
  geom_point() +
  lims(y = c(10, 45)) +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Used 579_2012_19_JinRo_Chamisul Fresh_002.JPG instead of id == 572 (the next one over from the same brand) because 001 also has beer. 217 had to be hand-copied over.

Exchange:

soju_sample[soju_sample$id == 579, ] <- soju[soju$id == 572, ]

Get all the file names:

# Build the expected image path for each sampled ad (all assumed .JPG).
all_files <- str_c('../all_images/', soju_sample$item, '.JPG')

Loop:

# Copy every sampled image into the target folder. file.copy() is
# vectorized over its first argument, so no loop is needed; it returns
# a logical success flag per file, which we capture so that failures
# (e.g. the filename mismatch for ad 217) are reported explicitly
# rather than only surfacing as warnings.
copied <- file.copy(all_files, '../all_images_2020sep/')
if (any(!copied)) {
  message('Could not copy: ',
          str_c(all_files[!copied], collapse = ', '))
}
## Warning in file.copy(file, "../all_images_2020sep/"): problem copying
## ../all_images/217_1998_23_JinRo_Sunhan Jinro_002.JPG to
## ../all_images_2020sep/217_1998_23_JinRo_Sunhan Jinro_002.JPG: No such file or
## directory

Filename for 217_1998_23_JinRo_Sunhan Jinro_002.JPG is different, so copied it over by hand.

Write:

# Write the 2020s-separate sample to disk for the qualitative analysis.
write_csv(soju_sample, '../data/soju_sample_2020s_separate.csv')

This is the file that will be used for qualitative analysis.

6 Individual results

In this section, we will look at each variable in turn, providing one-dimensional snapshots of each development over time.

6.1 Textual content and typography

6.1.1 Text count

Look at the range of the text_count variable:

# Range of word counts across all ads; na.rm = TRUE guards against
# missing text counts.
range(soju$text_count, na.rm = TRUE)
## [1]   0 454

Which ones are these extreme points?

# Shortest ads (no words):

# Identify the ads with zero text.
filter(soju, text_count == 0) |> 
  select(id, year, company, brand)
## # A tibble: 2 × 4
##   id     year company    brand          
##   <chr> <dbl> <chr>      <chr>          
## 1 535    2011 BoHae 보해 Yipsejoo 잎새주
## 2 539    2011 BoHae 보해 Yipsejoo 잎새주
# Longest ad:

# Identify the ad with the most text. na.rm = TRUE keeps this
# consistent with the range() call above: if text_count ever contains
# NA, max() would otherwise return NA and the filter would silently
# match nothing.
filter(soju, text_count == max(text_count, na.rm = TRUE)) |> 
  select(id, year, company, brand)
## # A tibble: 1 × 4
##   id     year company       brand             
##   <chr> <dbl> <chr>         <chr>             
## 1 2      1967 DaeGwang 대광 Diamond 다이야몬드

Check the average number of text_count, grand average, and then across time:

# Grand average:

# Mean text count across all ads. na.rm = TRUE for consistency with
# the earlier range() call, in case text_count contains missing values
# (with no NAs present the result is unchanged).
soju |> 
  summarize(M = mean(text_count, na.rm = TRUE))
## # A tibble: 1 × 1
##       M
##   <dbl>
## 1  80.7
# Over time:

# Mean text count per decade bin; na.rm = TRUE guards against missing
# text counts, matching the earlier range() call.
soju |> 
  group_by(year_binned) |> 
  summarize(M = mean(text_count, na.rm = TRUE))
## # A tibble: 7 × 2
##   year_binned     M
##         <dbl> <dbl>
## 1        1960 130. 
## 2        1970 107. 
## 3        1980 174. 
## 4        1990 155. 
## 5        2000  90.4
## 6        2010  42.1
## 7        2020  36.1

Let’s show the distribution of text counts (histogram):

# Distribution of text counts, built manually via count() + geom_col().
# NOTE(review): counts above 30 would be clipped by the y limit — none
# appear to be, but verify if the data change.
soju |> 
  count(text_count, sort = TRUE) |> 
  ggplot(aes(x = text_count, y = n)) +
  geom_col(col = 'black') +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 30),
                     breaks = seq(0, 30, 5))

Quite a nice spread.

Calculate and plot the average text count per year.

# Mean text count per decade bin, drawn as a line plot. na.rm = TRUE
# for consistency with the earlier range() call, so a stray missing
# text count cannot blank out a whole bin.
soju |> 
  group_by(year_binned) |> 
  summarize(text_mean = mean(text_count, na.rm = TRUE)) |>
  ggplot(aes(x = year_binned, y = text_mean, group = 1)) +
  geom_line() +
  scale_y_continuous(limits = c(0, 300),
                     breaks = seq(0, 300, 50))

Weird shape. Could be due to binning. Let’s see how the trend looks if we average by continuous year:

# Same trend averaged by continuous year instead of decade bins.
# na.rm = TRUE added for consistency with the earlier range() call.
soju |> 
  group_by(year) |> 
  summarize(text_mean = mean(text_count, na.rm = TRUE)) |>
  ggplot(aes(x = year, y = text_mean, group = 1)) +
  geom_line() +
  scale_y_continuous(limits = c(0, 300),
                     breaks = seq(0, 300, 50))

It definitely looks like a real trend on first pass, but may be worth checking how it’s driven by certain ads. So, first ads had little text, then more, peaking in the 80’s, then less and less text — quite systematically so! — over the last three decades.

Let’s model this using a negative binomial GAMM:

# Negative binomial GAMM: text count as a smooth function of year with
# by-company random intercepts. The negative binomial family handles
# overdispersed counts; adapt_delta is raised to 0.99 to reduce
# divergent transitions when sampling the spline terms.
text_count_mdl <- brm(bf(text_count ~ 1 +
                        s(year) +
                        (1|company)),
                   data = soju,
                   family = negbinomial(),
                   
                   # MCMC settings:
                   
                   cores = 4, seed = 42,
                   chains = 4, iter = 6000, warmup = 4000,
                   control = list(adapt_delta = 0.99))

# Save model:

save(text_count_mdl, file = '../models/text_count_mdl.RData')

Load:

# Reload the saved fit so the model above need not be re-run each knit.
load('../models/text_count_mdl.RData')

Check model:

# Print the model summary (inspect Rhat and effective sample sizes).
text_count_mdl
##  Family: negbinomial 
##   Links: mu = log; shape = identity 
## Formula: text_count ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     2.44      1.18     0.95     5.42 1.00     2099     3577
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.28      0.10     0.13     0.53 1.00     1682     3579
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     4.24      0.08     4.07     4.41 1.00     2273     2936
## syear_1       1.39      3.58    -4.44     9.55 1.00     2841     3350
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     2.46      0.13     2.23     2.72 1.00     7844     5343
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Get the marginal effect. Save it in an object for plotting and also show:

# Extract the marginal effect of year on the response scale; kept as a
# data frame for the custom plot further down.
text_count_conditional <- conditional_effects(text_count_mdl)$year

# Show: 

text_count_conditional
##         year text_count company cond__ effect1__ estimate__      se__   lower__
## 1   1960.000   80.69009      NA      1  1960.000   71.82053 23.215144  33.92851
## 2   1960.616   80.69009      NA      1  1960.616   75.29434 22.206758  38.26815
## 3   1961.232   80.69009      NA      1  1961.232   78.94340 21.021829  43.11173
## 4   1961.848   80.69009      NA      1  1961.848   82.84021 19.862221  47.94220
## 5   1962.465   80.69009      NA      1  1962.465   86.84605 18.970153  53.07727
## 6   1963.081   80.69009      NA      1  1963.081   90.73963 18.055442  58.19113
## 7   1963.697   80.69009      NA      1  1963.697   94.71937 17.333714  63.28953
## 8   1964.313   80.69009      NA      1  1964.313   98.67180 16.817934  67.99032
## 9   1964.929   80.69009      NA      1  1964.929  102.36817 16.368061  72.32119
## 10  1965.545   80.69009      NA      1  1965.545  105.90693 16.014298  76.37674
## 11  1966.162   80.69009      NA      1  1966.162  109.22872 15.846435  80.01703
## 12  1966.778   80.69009      NA      1  1966.778  112.09484 15.534264  83.44643
## 13  1967.394   80.69009      NA      1  1967.394  114.60456 15.322043  86.52841
## 14  1968.010   80.69009      NA      1  1968.010  116.89648 15.169959  88.88290
## 15  1968.626   80.69009      NA      1  1968.626  118.94851 14.819049  91.20780
## 16  1969.242   80.69009      NA      1  1969.242  120.49716 14.713531  92.93415
## 17  1969.859   80.69009      NA      1  1969.859  122.11243 14.810587  94.36404
## 18  1970.475   80.69009      NA      1  1970.475  123.44113 14.916150  95.80319
## 19  1971.091   80.69009      NA      1  1971.091  124.75180 15.271257  96.62249
## 20  1971.707   80.69009      NA      1  1971.707  126.03446 15.551547  97.32944
## 21  1972.323   80.69009      NA      1  1972.323  127.40874 16.078778  97.63824
## 22  1972.939   80.69009      NA      1  1972.939  128.71285 16.825822  97.95898
## 23  1973.556   80.69009      NA      1  1973.556  130.17865 17.456635  98.39847
## 24  1974.172   80.69009      NA      1  1974.172  131.78684 18.379044  98.72257
## 25  1974.788   80.69009      NA      1  1974.788  133.76182 19.183183  99.14179
## 26  1975.404   80.69009      NA      1  1975.404  135.86774 20.097466  99.72073
## 27  1976.020   80.69009      NA      1  1976.020  138.16960 20.925548 100.91349
## 28  1976.636   80.69009      NA      1  1976.636  140.65319 21.794390 101.88443
## 29  1977.253   80.69009      NA      1  1977.253  143.38876 22.724277 103.27694
## 30  1977.869   80.69009      NA      1  1977.869  146.08386 23.493828 104.68042
## 31  1978.485   80.69009      NA      1  1978.485  149.23854 24.147276 106.24856
## 32  1979.101   80.69009      NA      1  1979.101  152.37480 24.849188 107.73467
## 33  1979.717   80.69009      NA      1  1979.717  156.00522 25.566003 110.15567
## 34  1980.333   80.69009      NA      1  1980.333  159.40583 26.153689 112.47551
## 35  1980.949   80.69009      NA      1  1980.949  163.27012 26.799794 115.54424
## 36  1981.566   80.69009      NA      1  1981.566  167.03029 27.340570 118.73054
## 37  1982.182   80.69009      NA      1  1982.182  170.91150 27.750016 122.11132
## 38  1982.798   80.69009      NA      1  1982.798  174.91258 28.054749 125.81425
## 39  1983.414   80.69009      NA      1  1983.414  178.89823 28.522544 129.93125
## 40  1984.030   80.69009      NA      1  1984.030  183.25264 28.643975 134.02514
## 41  1984.646   80.69009      NA      1  1984.646  187.38469 29.204958 138.09020
## 42  1985.263   80.69009      NA      1  1985.263  191.43419 29.473868 142.31706
## 43  1985.879   80.69009      NA      1  1985.879  195.20048 29.507873 146.09821
## 44  1986.495   80.69009      NA      1  1986.495  199.16869 29.591804 150.09418
## 45  1987.111   80.69009      NA      1  1987.111  202.60695 29.543923 153.63578
## 46  1987.727   80.69009      NA      1  1987.727  205.78416 29.581140 157.05816
## 47  1988.343   80.69009      NA      1  1988.343  208.37546 29.450236 160.06170
## 48  1988.960   80.69009      NA      1  1988.960  210.26879 29.471591 162.73283
## 49  1989.576   80.69009      NA      1  1989.576  211.42959 29.202049 164.98773
## 50  1990.192   80.69009      NA      1  1990.192  211.95637 28.379572 166.71959
## 51  1990.808   80.69009      NA      1  1990.808  211.60741 27.824232 167.37455
## 52  1991.424   80.69009      NA      1  1991.424  210.10967 26.833628 167.56084
## 53  1992.040   80.69009      NA      1  1992.040  207.81165 25.685417 166.54978
## 54  1992.657   80.69009      NA      1  1992.657  204.38763 24.576033 164.63674
## 55  1993.273   80.69009      NA      1  1993.273  199.77433 23.189664 162.03488
## 56  1993.889   80.69009      NA      1  1993.889  194.45864 21.842366 159.47780
## 57  1994.505   80.69009      NA      1  1994.505  188.38990 20.338261 155.26797
## 58  1995.121   80.69009      NA      1  1995.121  181.69454 18.863134 150.42895
## 59  1995.737   80.69009      NA      1  1995.737  174.63518 17.555814 145.19388
## 60  1996.354   80.69009      NA      1  1996.354  167.32026 16.396228 139.04012
## 61  1996.970   80.69009      NA      1  1996.970  159.95868 15.635996 132.86204
## 62  1997.586   80.69009      NA      1  1997.586  152.64070 14.941398 126.48023
## 63  1998.202   80.69009      NA      1  1998.202  145.66490 14.534173 119.93286
## 64  1998.818   80.69009      NA      1  1998.818  139.04233 14.288069 113.63472
## 65  1999.434   80.69009      NA      1  1999.434  132.72580 14.037341 107.80322
## 66  2000.051   80.69009      NA      1  2000.051  126.83521 13.721120 102.19384
## 67  2000.667   80.69009      NA      1  2000.667  121.40818 13.380570  97.18578
## 68  2001.283   80.69009      NA      1  2001.283  116.39804 12.762099  92.86650
## 69  2001.899   80.69009      NA      1  2001.899  111.82200 12.265685  89.05205
## 70  2002.515   80.69009      NA      1  2002.515  107.54296 11.680135  85.81704
## 71  2003.131   80.69009      NA      1  2003.131  103.55020 11.028997  82.82468
## 72  2003.747   80.69009      NA      1  2003.747   99.86982 10.457477  80.10774
## 73  2004.364   80.69009      NA      1  2004.364   96.45371  9.902729  78.00673
## 74  2004.980   80.69009      NA      1  2004.980   93.17372  9.462159  75.46355
## 75  2005.596   80.69009      NA      1  2005.596   89.97553  8.992395  73.15805
## 76  2006.212   80.69009      NA      1  2006.212   86.77490  8.614727  70.80118
## 77  2006.828   80.69009      NA      1  2006.828   83.63091  8.142410  68.26548
## 78  2007.444   80.69009      NA      1  2007.444   80.46038  7.822267  65.88451
## 79  2008.061   80.69009      NA      1  2008.061   77.30128  7.475701  63.29866
## 80  2008.677   80.69009      NA      1  2008.677   74.11097  7.128853  60.74383
## 81  2009.293   80.69009      NA      1  2009.293   70.90159  6.846359  58.14901
## 82  2009.909   80.69009      NA      1  2009.909   67.70120  6.493303  55.59476
## 83  2010.525   80.69009      NA      1  2010.525   64.53150  6.126828  53.10896
## 84  2011.141   80.69009      NA      1  2011.141   61.40448  5.784437  50.58637
## 85  2011.758   80.69009      NA      1  2011.758   58.39608  5.466795  48.15998
## 86  2012.374   80.69009      NA      1  2012.374   55.46778  5.210400  45.79246
## 87  2012.990   80.69009      NA      1  2012.990   52.66550  4.959800  43.47787
## 88  2013.606   80.69009      NA      1  2013.606   50.05897  4.728373  41.27172
## 89  2014.222   80.69009      NA      1  2014.222   47.62431  4.521140  39.21761
## 90  2014.838   80.69009      NA      1  2014.838   45.34989  4.333999  37.32581
## 91  2015.455   80.69009      NA      1  2015.455   43.28465  4.121843  35.64600
## 92  2016.071   80.69009      NA      1  2016.071   41.39964  3.876817  34.10861
## 93  2016.687   80.69009      NA      1  2016.687   39.66574  3.665706  32.71281
## 94  2017.303   80.69009      NA      1  2017.303   38.06110  3.462198  31.46126
## 95  2017.919   80.69009      NA      1  2017.919   36.60016  3.278342  30.24610
## 96  2018.535   80.69009      NA      1  2018.535   35.26626  3.155129  29.08575
## 97  2019.152   80.69009      NA      1  2019.152   34.02424  3.068181  27.93144
## 98  2019.768   80.69009      NA      1  2019.768   32.85178  3.043132  26.81573
## 99  2020.384   80.69009      NA      1  2020.384   31.69252  3.072901  25.69048
## 100 2021.000   80.69009      NA      1  2021.000   30.62510  3.168397  24.59143
##       upper__
## 1   133.18844
## 2   132.87695
## 3   132.97888
## 4   132.65834
## 5   133.09679
## 6   133.76820
## 7   134.44125
## 8   135.94417
## 9   138.71111
## 10  141.02021
## 11  143.77882
## 12  146.83879
## 13  149.64160
## 14  151.81639
## 15  153.03842
## 16  154.26471
## 17  155.41448
## 18  157.02071
## 19  158.68888
## 20  160.83663
## 21  163.08941
## 22  166.26522
## 23  168.61078
## 24  171.90185
## 25  175.01395
## 26  178.54171
## 27  182.38124
## 28  186.76870
## 29  191.62398
## 30  196.55615
## 31  202.27686
## 32  207.28579
## 33  212.62053
## 34  218.17741
## 35  223.78143
## 36  229.27507
## 37  235.03515
## 38  240.80069
## 39  245.64625
## 40  251.21024
## 41  257.14570
## 42  263.09377
## 43  268.50396
## 44  273.14045
## 45  277.43703
## 46  279.88510
## 47  281.96900
## 48  283.24212
## 49  283.10876
## 50  281.94955
## 51  279.52760
## 52  275.63597
## 53  270.93062
## 54  264.16366
## 55  255.84531
## 56  247.56601
## 57  237.96041
## 58  228.04555
## 59  217.85251
## 60  207.75726
## 61  198.22528
## 62  189.16612
## 63  180.86207
## 64  172.93034
## 65  165.44822
## 66  158.73979
## 67  152.13210
## 68  146.09372
## 69  140.18238
## 70  134.60982
## 71  129.13375
## 72  124.22002
## 73  119.61564
## 74  115.27436
## 75  111.10150
## 76  107.14998
## 77  103.02487
## 78   98.97585
## 79   94.85762
## 80   91.04257
## 81   87.00833
## 82   83.06910
## 83   79.09084
## 84   75.22045
## 85   71.37636
## 86   67.85649
## 87   64.46600
## 88   61.19054
## 89   58.17233
## 90   55.40551
## 91   52.90070
## 92   50.55315
## 93   48.33618
## 94   46.30167
## 95   44.52516
## 96   42.80642
## 97   41.27479
## 98   39.90750
## 99   38.74411
## 100  37.82808

Let’s make a better plot with the raw data points:

# Plot core:

# Model-based ribbon (lower__/upper__ bounds) and fitted line from the
# GAMM, overlaid with jittered raw points colored and shaped by medium.
# NOTE(review): `size` in geom_line is deprecated in favor of
# `linewidth` as of ggplot2 3.4 — confirm the installed version.
text_count_p <- soju |> 
  ggplot(aes(x = year)) +
  geom_ribbon(data = text_count_conditional,
              mapping = aes(x = year, ymin = lower__, ymax = upper__),
              fill = 'grey') +
  geom_line(data = text_count_conditional,
            mapping = aes(x = year, y = estimate__),
            col = 'purple', size = 1.35) +
  geom_jitter(mapping = aes(color = medium,
                            y = text_count,
                            shape = medium),
              alpha = 0.5,
              width = 0.17,
              height = 2)

# Axes and labels:

# Fixed axis limits, manual shapes/colors for the medium legend
# (assumes three levels of medium — confirm), top legend, no title.
text_count_p <- text_count_p +
  scale_x_continuous(limits = c(1955, 2029),
                     expand = c(0, 0),
                     breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 500),
                     breaks = seq(0, 500, 100),
                     expand = c(0, 0)) +
  scale_shape_manual(values = c(17, 16, 15)) +
  scale_color_manual(values = c('black', 'purple', 'darkgreen')) +
  xlab('Year') +
  ylab('Text count') +
  theme(legend.position = 'top',
        legend.title = element_blank())

# Show and save:

# Render the plot (the warning below reports rows dropped for falling
# outside the fixed axis limits or having missing values).
text_count_p
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Save the PDF version of the time-trend figure.
ggsave('../figures/pdf/text_count_time_trend.pdf', text_count_p,
       width = 5.8, height = 3.7)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Save the PNG version at identical dimensions.
ggsave('../figures/png/text_count_time_trend.png', text_count_p,
       width = 5.8, height = 3.7)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

6.1.2 Font style

6.1.3 Font weight

6.2 Script

6.2.1 Hanja

Is there hanja, i.e., Chinese characters?

# Tabulate the detailed hanja categories with counts and percentages.
# NOTE(review): adorn_percentages() here takes a bare column and
# returns n/p columns — this looks like a custom helper defined earlier
# in the script rather than janitor's adorn_percentages(); confirm.
soju |> 
  adorn_percentages(hanja) |> 
  print(n = Inf)
## # A tibble: 25 × 3
##    hanja                                                             n p    
##    <chr>                                                         <int> <chr>
##  1 no                                                              551 69%  
##  2 yes, brand                                                       74 9%   
##  3 yes, company                                                     23 3%   
##  4 yes, main slogan,secondary slogan, company and brand             20 3%   
##  5 yes, secondary slogan                                            20 3%   
##  6 yes, main slogan                                                 17 2%   
##  7 yes, brand and scondary slogan                                   15 2%   
##  8 yes, brand and company                                           12 2%   
##  9 yes, main slogan and brand                                        9 1%   
## 10 yes, main slogan, secondary slogan and brand                      9 1%   
## 11 yes, main slogan and secondary slogan                             6 1%   
## 12 yes, main slogan, secondary slogan, brand and company             6 1%   
## 13 yes, main slogan,company and brand                                6 1%   
## 14 yes, company and brand                                            5 1%   
## 15 yes, main slogan,secondary slogan, company, brand and contact     5 1%   
## 16 yes, secondary slogan and brand                                   5 1%   
## 17 yes, main slogan and company                                      3 0%   
## 18 yes, main slogan,company, brand and contact                       2 0%   
## 19 yes, secondary and brand                                          2 0%   
## 20 yes, secondary slogan, company and brand                          2 0%   
## 21 yes, brand and contact                                            1 0%   
## 22 yes, company and contact                                          1 0%   
## 23 yes, contact                                                      1 0%   
## 24 yes, main slogan and secondary slogan.                            1 0%   
## 25 yes, main slogan,secondary slogan, company and contact            1 0%

Easier to just look at overall levels:

# Collapse to the reduced yes/no variable for the overall hanja levels.
soju |> 
  adorn_percentages(hanja_red)
## # A tibble: 2 × 3
##   hanja_red     n p    
##   <chr>     <int> <chr>
## 1 no          551 69%  
## 2 yes         246 31%

We’ll use the yes/no variable for looking at time trends, but we can also look at whether hanja was first dropped in the main slogan, etc. I could imagine that there was a gradual transition away from hanja’s importance over time, with fewer and fewer elements containing it.

Look at the time trend for the reduced variable hanja_red (Yes/No) as a proportion and a stacked bar plot:

# Plot core:

# Proportion of ads with/without hanja per decade bin, as stacked bars.
hanja_bar_p <- soju |> 
  count(year_binned, hanja_red) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = hanja_red)) +
  geom_col(col = 'black')

# Axes and labels:

# Fill colors follow level order: first level ('no') purple, second
# ('yes') goldenrod. Decade labels are angled for readability.
hanja_bar_p <- hanja_bar_p +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show:

# Render the stacked bar plot.
hanja_bar_p

Clear temporal trend with less Hanja over time. It’ll be interesting to look at the few modern ads that still have Hanja, i.e., the post-2015 ones. Perhaps in those ads it serves a special purpose? It could signal old-school character, for example. Here are the newest ads with hanja:

# List all post-2015 ads that still contain hanja, printed in full.
soju |> 
  filter(hanja_red == 'yes' & year > 2015) |> 
  select(id, company, brand, hanja) |> 
  print(n = Inf)
## # A tibble: 39 × 4
##    id    company        brand                               hanja               
##    <chr> <chr>          <chr>                               <chr>               
##  1 398   DaeSun 대선    C1 Soft 시원 소프트                 yes, main slogan    
##  2 420   DaeSun 대선    Dasun 대선                          yes, secondary slog…
##  3 429   DaeSun 대선    C1 Soft 시원 소프트                 yes, secondary slog…
##  4 449   DaeSun 대선    Dasun 대선                          yes, main slogan    
##  5 454   DaeSun 대선    Dasun 대선                          yes, main slogan    
##  6 456   DaeSun 대선    Dasun 대선                          yes, company and br…
##  7 457   DaeSun 대선    Dasun 대선                          yes, company and br…
##  8 459   DaeSun 대선    Dasun 대선                          yes, main slogan an…
##  9 464   DaeSun 대선    Dasun 대선                          yes, brand          
## 10 465   DaeSun 대선    Dasun 대선                          yes, brand          
## 11 690   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 12 706   GeumBok 금복   Soju King 소주왕                    yes, main slogan    
## 13 720   DaeSun 대선    Dasun 대선                          yes, main slogan    
## 14 731   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 15 732   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 16 733   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 17 734   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 18 735   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 19 736   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 20 737   Mackis 마키스  Linn 21 린 21                       yes, secondary slog…
## 21 744   MuHak 무학     Cheongchun 청춘 & Good Day 좋은데이 yes, secondary slog…
## 22 746   MuHak 무학     Cheongchun 청춘 & Good Day 좋은데이 yes, secondary slog…
## 23 759   JinRo 진로     Jinro 진로                          yes, brand          
## 24 760   JinRo 진로     Jinro 진로                          yes, brand          
## 25 761   JinRo 진로     Jinro 진로                          yes, brand          
## 26 762   JinRo 진로     Jinro 진로                          yes, brand          
## 27 763   JinRo 진로     Jinro 진로                          yes, brand          
## 28 775   JinRo 진로     Chamisul Melon 메로나에 이슬        yes, brand          
## 29 776   JinRo 진로     Chamisul Melon 메로나에 이슬        yes, brand          
## 30 777   JinRo 진로     Chamisul Melon 메로나에 이슬        yes, brand          
## 31 778   JinRo 진로     Chamisul Melon 메로나에 이슬        yes, brand          
## 32 779   JinRo 진로     Chamisul Melon 메로나에 이슬        yes, brand          
## 33 791   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 34 792   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 35 793   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 36 794   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 37 795   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 38 796   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand          
## 39 797   ChoongBuk 충북 Cool Cheongpung 시원한 청풍         yes, brand

They are mostly these specific brands. Looking at the ads, it seems that this is desired to give the ads a retro feel.

For reporting, here is the proportion of Hanja counted by decade (year_binned):

# Proportion of ads containing hanja per decade, rounded and formatted
# as percentage strings for reporting.
soju |> 
  count(year_binned, hanja_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         p = round(p, 2),
         percentage = str_c(p * 100, '%')) |> 
  filter(hanja_red != 'no')
## # A tibble: 7 × 5
## # Groups:   year_binned [7]
##   year_binned hanja_red     n     p percentage
##         <dbl> <chr>     <int> <dbl> <chr>     
## 1        1960 yes          36  1    100%      
## 2        1970 yes          27  0.87 87%       
## 3        1980 yes          12  0.75 75%       
## 4        1990 yes          62  0.4  40%       
## 5        2000 yes          66  0.5  50%       
## 6        2010 yes          16  0.05 5%        
## 7        2020 yes          27  0.3  30%

Build a model of p(y = has hanja), which will be a logistic regression model:

# Factor-code the variable with desired level order:

# 'no' first, so that 'yes' is the modeled (second) level in the
# Bernoulli regression below.
soju <- mutate(soju,
               hanja_red = factor(hanja_red, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):

# Bernoulli GAMM: p(hanja present) as a smooth function of year with
# by-company random intercepts.
# NOTE(review): max_treedepth = 12 suggests treedepth warnings occurred
# during an earlier fit — confirm.
hanja_mdl <- brm(bf(hanja_red ~ 1 +
                        s(year) +
                        (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(hanja_mdl, file = '../models/hanja_mdl.RData')

Load:

# Reload the saved fit so the model above need not be re-run each knit.
load('../models/hanja_mdl.RData')

Show posterior predictive simulations:

# Posterior predictive checks: default density overlay, then the
# empirical CDF overlay (100 posterior draws each).
pp_check(hanja_mdl, ndraws = 100)

pp_check(hanja_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

# Print the model summary (inspect Rhat and effective sample sizes).
hanja_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: hanja_red ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)    24.88      8.18    13.26    44.90 1.00     3224     4412
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.45      0.50     0.74     2.67 1.00     2181     4264
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.97      0.46    -1.81     0.04 1.00     2598     3700
## syear_1     -15.39     38.35  -103.10    47.57 1.00     2964     3396
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

# Extract the conditional effect of year for the custom plot below.
hanja_mdl_df <- conditional_effects(hanja_mdl)$year

Make a plot of the curve:

# Plot core:

# Fitted probability curve with its uncertainty ribbon.
# NOTE(review): `size` in geom_line is deprecated in favor of
# `linewidth` as of ggplot2 3.4 — confirm the installed version.
hanja_p <- hanja_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25)

# Axes and labels:

# y runs slightly above 1 so the ribbon is not clipped at p = 1.
hanja_p <- hanja_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1.05),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Proportion of Hanja')

# Show and save:

# Render, then save PDF and PNG versions at identical dimensions.
hanja_p

ggsave('../figures/pdf/hanja.pdf', hanja_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/hanja.png', hanja_p,
       width = 5.8, height = 3.7)

6.2.2 Roman letters

What about the presence of roman letters?

# Tabulate the detailed roman-letter categories with counts and
# percentages, printed in full.
soju |> 
  adorn_percentages(roman) |> 
  print(n = Inf)
## # A tibble: 40 × 3
##    roman                                                              n p    
##    <chr>                                                          <int> <chr>
##  1 no                                                               210 26%  
##  2 yes, brand                                                       159 20%  
##  3 yes, company                                                      96 12%  
##  4 yes, secondary slogan                                             58 7%   
##  5 yes, company and brand                                            50 6%   
##  6 yes, brand and contact                                            45 6%   
##  7 yes, contact                                                      20 3%   
##  8 yes, main slogan                                                  20 3%   
##  9 yes, secondary slogan, company and brand                          14 2%   
## 10 yes, brand and company                                            12 2%   
## 11 yes, brand and secondary slogan                                   11 1%   
## 12 yes, company and contact                                          11 1%   
## 13 yes, company and event                                            10 1%   
## 14 yes, secondary slogan and brand                                   10 1%   
## 15 yes, main slogan and company                                       7 1%   
## 16 yes, main slogan, secondary slogan and brand                       6 1%   
## 17 yes, main slogan, secondary slogan, company and brand              6 1%   
## 18 yes, secondary slogan and company                                  6 1%   
## 19 yes, event                                                         5 1%   
## 20 yes, main slogan and brand                                         5 1%   
## 21 yes, main slogan, comapny and brand                                5 1%   
## 22 yes, main slogan, secondary slogan and company                     4 1%   
## 23 yes, company, brand and contact                                    3 0%   
## 24 yes, main slogan and secondary slogan                              3 0%   
## 25 yes, brand, secondary slogan and contact                           2 0%   
## 26 yes, company, brand and event                                      2 0%   
## 27 yes, event and contact                                             2 0%   
## 28 yes, secondary and contact                                         2 0%   
## 29 yes, secondary slogan and contact                                  2 0%   
## 30 yes, brand and event                                               1 0%   
## 31 yes, logo                                                          1 0%   
## 32 yes, main slogan and contact                                       1 0%   
## 33 yes, main slogan, company and brand                                1 0%   
## 34 yes, main slogan, company, brand and contact                       1 0%   
## 35 yes, main slogan, secondary slogan, company, brand and contact     1 0%   
## 36 yes, secondary slogan and event                                    1 0%   
## 37 yes, secondary slogan, company and contact                         1 0%   
## 38 yes, secondary slogan, company, brand and contact                  1 0%   
## 39 yes, secondary slogan, event and contact                           1 0%   
## 40 yes, volume                                                        1 0%

As with the hanja variable, we may want to look at the subcategories of the yes cases later. For example, it could be interesting to see the continuous “creep” of Roman characters into different aspects of the ads.

# Collapse to the reduced yes/no variable for overall roman levels.
soju |> 
  adorn_percentages(roman_red)
## # A tibble: 2 × 3
##   roman_red     n p    
##   <chr>     <int> <chr>
## 1 yes         587 74%  
## 2 no          210 26%

Anyway, to look at time trends, using the Yes/No variable roman_red:

# Stacked bars: within-decade proportion of ads with/without Roman letters.

soju |>
  count(year_binned, roman_red) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = year_binned, y = prop, fill = roman_red)) +
  geom_col(color = 'black') +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Proportionally, there is definitely much more Roman over time, although it drops back a bit in 2015-2025. Perhaps a backlash against modernization and a desire to be more Korean again, in pursuit of a “retro” feel.

What are the earliest ads with Roman letters?

# Earliest ads containing Roman letters, oldest first.

soju |>
  filter(roman_red == 'yes') |>
  select(id, company, brand, year, hanja_red, roman_red) |>
  arrange(year)
## # A tibble: 587 × 6
##    id                          company       brand      year hanja_red roman_red
##    <chr>                       <chr>         <chr>     <dbl> <chr>     <chr>    
##  1 6                           BaekYang 백양 BaekYang…  1963 yes       yes      
##  2 22                          SeoGwang 서광 Jinro 진…  1963 yes       yes      
##  3 5                           MiSeong 미성  MiSeong …  1964 yes       yes      
##  4 23                          SeoGwang 서광 Jinro 진…  1964 yes       yes      
##  5 24                          SeoGwang 서광 Jinro 진…  1964 yes       yes      
##  6 27                          SeoGwang 서광 Jinro 진…  1965 yes       yes      
##  7 2                           DaeGwang 대광 Diamond …  1967 yes       yes      
##  8 11                          DaeHan 대한   Baekhwa …  1967 yes       yes      
##  9 31                          SeoGwang 서광 Jinro 진…  1967 yes       yes      
## 10 vocal_utterance_word_number GeumGok 금곡  Geumgok …  1968 yes       yes      
## # ℹ 577 more rows

For reporting, here is the proportion of Roman letters counted by decade (year_binned):

# Per-decade percentage of ads with Roman letters, for reporting.

soju |>
  count(year_binned, roman_red) |>
  group_by(year_binned) |>
  mutate(p = round(n / sum(n), 2),
         percentage = str_c(100 * p, '%')) |>
  filter(roman_red != 'no')
## # A tibble: 7 × 5
## # Groups:   year_binned [7]
##   year_binned roman_red     n     p percentage
##         <dbl> <chr>     <int> <dbl> <chr>     
## 1        1960 yes          12  0.33 33%       
## 2        1970 yes          10  0.32 32%       
## 3        1980 yes          10  0.62 62%       
## 4        1990 yes         140  0.9  90%       
## 5        2000 yes         126  0.95 95%       
## 6        2010 yes         225  0.67 67%       
## 7        2020 yes          64  0.7  70%

Is there any relation between having hanja and roman characters?

# Cross-tabulate hanja presence against Roman-letter presence;
# proportions are computed WITHIN each hanja_red level (rows sum
# to 1 per hanja group).

soju |> 
  count(hanja_red, roman_red) |> 
  group_by(hanja_red) |> 
  mutate(prop = n / sum(n))
## # A tibble: 4 × 4
## # Groups:   hanja_red [2]
##   hanja_red roman_red     n  prop
##   <chr>     <chr>     <int> <dbl>
## 1 no        no          137 0.249
## 2 no        yes         414 0.751
## 3 yes       no           73 0.297
## 4 yes       yes         173 0.703

Not too much, but a little bit. Those ads that have no hanja, have a slightly higher percentage of Roman letters (75% as opposed to 71%).

# Factor-code the roman_red variable ('no' first = reference level,
# so the model predicts p(Roman letters present)):

soju <- mutate(soju,
               roman_red = factor(roman_red, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):
# p(Roman letters present) as a smooth function of year, with
# by-company random intercepts to account for repeated ads per company.

roman_mdl <- brm(bf(roman_red ~ 1 +
                      s(year) +
                      (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings (high adapt_delta / treedepth to
                 # avoid divergent transitions):
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(roman_mdl, file = '../models/roman_mdl.RData')

Load model:

load('../models/roman_mdl.RData')

Check model:

roman_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: roman_red ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     8.79      3.45     4.13    17.12 1.00     3175     4661
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.35      0.37     0.80     2.24 1.00     2199     4107
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     1.22      0.37     0.48     1.96 1.00     2252     3671
## syear_1      -3.48     11.11   -23.20    20.95 1.00     3037     4061
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Show posterior predictive simulations:

pp_check(roman_mdl, ndraws = 100)

Extract conditional effects for plotting:

roman_mdl_df <- conditional_effects(roman_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible interval ribbon.

roman_p <- roman_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # `linewidth` replaces the `size` aesthetic for lines
  # (deprecated in ggplot2 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

roman_p <- roman_p +
  scale_x_continuous(breaks = seq(from = 1960, to = 2020, by = 10)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  labs(x = 'Year', y = 'Probability of Roman letters')

# Show and save:

roman_p

ggsave('../figures/pdf/roman.pdf', roman_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/roman.png', roman_p,
       width = 5.8, height = 3.7)

6.2.3 Hangul loan words

What about Hangul loan words?

soju |> 
  count(hangul_loan_words, sort = TRUE)
## # A tibble: 16 × 2
##    hangul_loan_words                         n
##    <chr>                                 <int>
##  1 no                                      529
##  2 yes, secondary slogan                   148
##  3 yes, main slogan                         34
##  4 yes, brand                               27
##  5 yes, main slogan and secondary slogan    18
##  6 yes, event                               12
##  7 yes, company                              8
##  8 yes, main slogan and brand                7
##  9 yes, secondary slogan and brand           6
## 10 yes, secondary slogan and company         2
## 11 yes, brand and event                      1
## 12 yes, company and brand                    1
## 13 yes, contact                              1
## 14 yes, main slogan andsecondary slogan      1
## 15 yes, secondary slogan andn brand          1
## 16 yes, secondary slogan andn event          1

Check Hangul loan words with the loan_word_red variable:

soju |> 
  adorn_percentages(loan_word_red)
## # A tibble: 2 × 3
##   loan_word_red     n p    
##   <chr>         <int> <chr>
## 1 no              529 66%  
## 2 yes             268 34%

Make a temporal trend plot for the loan words, using loan_word_red:

# Stacked bars: within-decade proportion of ads with/without loan words
# (NA rows dropped after counting, before proportions are computed).

soju |>
  count(year_binned, loan_word_red) |>
  filter(!is.na(loan_word_red)) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = year_binned, y = prop, fill = loan_word_red)) +
  geom_col(color = 'black') +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Definitely more loan words over time. Very few proportionally in the early years. Perhaps zoom into those and see what’s going on there? What’s the earliest years that have loan words?

soju |> 
  filter(loan_word_red == 'yes') |> 
  arrange(year) |> 
  select(id, company, brand, year)
## # A tibble: 268 × 4
##    id                          company        brand                    year
##    <chr>                       <chr>          <chr>                   <dbl>
##  1 28                          SeoGwang 서광  Jinro 진로               1965
##  2 15                          BoBae 보배     Bobae 보배               1966
##  3 36                          TaePyeong 태평 007Soju 007소주          1966
##  4 2                           DaeGwang 대광  Diamond 다이야몬드       1967
##  5 vocal_utterance_word_number GeumGok 금곡   Geumgok 금곡             1968
##  6 13                          DaeHan 대한    Baekhwa Subok 백화 수복  1968
##  7 16                          SamHak 삼학    Samhak 삼학              1968
##  8 17                          SamHak 삼학    Samhak 삼학              1968
##  9 59                          JinRo 진로     Jinro 진로               1970
## 10 62                          JinRo 진로     Jinro 진로               1970
## # ℹ 258 more rows

It’s really very few pre-70’s, so worth having a look at to see what’s going on with these ads that are first-movers for loan words.

Let’s compute the time trend for reporting like this:

# Per-decade percentage of ads with loan words, for reporting.

soju |>
  count(year_binned, loan_word_red) |>
  group_by(year_binned) |>
  mutate(p = round(n / sum(n), 2),
         percentage = str_c(100 * p, '%')) |>
  filter(loan_word_red != 'no')
## # A tibble: 7 × 5
## # Groups:   year_binned [7]
##   year_binned loan_word_red     n     p percentage
##         <dbl> <chr>         <int> <dbl> <chr>     
## 1        1960 yes               8  0.22 22%       
## 2        1970 yes               9  0.29 29%       
## 3        1980 yes               4  0.25 25%       
## 4        1990 yes              48  0.31 31%       
## 5        2000 yes              59  0.45 45%       
## 6        2010 yes             106  0.32 32%       
## 7        2020 yes              34  0.37 37%

I suspect there may be stronger relations between presence of Roman and loan words?

soju |> 
  count(roman_red, loan_word_red) |> 
  group_by(roman_red) |> 
  mutate(prop = n / sum(n))
## # A tibble: 4 × 4
## # Groups:   roman_red [2]
##   roman_red loan_word_red     n  prop
##   <chr>     <chr>         <int> <dbl>
## 1 no        no              164 0.781
## 2 no        yes              46 0.219
## 3 yes       no              365 0.622
## 4 yes       yes             222 0.378

Yes, so, those ads that have Roman characters are also proportionally speaking much more likely to contain loan words (38%) than those ads that don’t have Roman characters (22%).

What about the relationship between loan words and hanja?

soju |> 
  count(hanja_red, loan_word_red) |> 
  group_by(hanja_red) |> 
  mutate(prop = n / sum(n))
## # A tibble: 4 × 4
## # Groups:   hanja_red [2]
##   hanja_red loan_word_red     n  prop
##   <chr>     <chr>         <int> <dbl>
## 1 no        no              355 0.644
## 2 no        yes             196 0.356
## 3 yes       no              174 0.707
## 4 yes       yes              72 0.293

Mmmh, this seems a bit counter-intuitive. So, those ads that don’t have hanja have proportionally more loan words (36%) than those ads that do (29%)? Could it be that the more “oldschool” ads that include hanja are also more likely to spell loan words in Hangul, rather than using Roman letters?

# Three-way contingency table (hanja x loan word x roman),
# dropping rows where roman_red or loan_word_red is missing:

hanja_roman_loan_tab <- with(filter(soju, !is.na(roman_red),
            !is.na(loan_word_red)),
     table(hanja_red, loan_word_red, roman_red))

hanja_roman_loan_tab
## , , roman_red = no
## 
##          loan_word_red
## hanja_red  no yes
##       no  106  31
##       yes  58  15
## 
## , , roman_red = yes
## 
##          loan_word_red
## hanja_red  no yes
##       no  249 165
##       yes 116  57

Action point 13: Think about the three-way relationship between hanja ~ roman letters ~ loan words.

Build a model of p(y = has loan word), which will be a logistic regression model:

# Factor-code the loan_word_red variable ('no' first = reference level,
# so the model predicts p(loan word present)):

soju <- mutate(soju,
               loan_word_red = factor(loan_word_red, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):
# p(loan word present) as a smooth function of year, with
# by-company random intercepts.

loan_mdl <- brm(bf(loan_word_red ~ 1 +
                     s(year) +
                     (1|company)),
                data = soju,
                family = bernoulli,
                 
                # MCMC settings (this fit still produced 13 divergent
                # transitions — see action point below):
                   
                cores = 4, seed = 42,
                chains = 4, iter = 6000, warmup = 4000,
                control = list(adapt_delta = 0.99,
                               max_treedepth = 12))

# Save model:

save(loan_mdl, file = '../models/loan_mdl.RData')

Bodo action point: Need to get rid of 13(!!) divergent transitions in MCMC sampling.

Load:

load('../models/loan_mdl.RData')

Show posterior predictive simulations:

pp_check(loan_mdl, ndraws = 100)

pp_check(loan_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

loan_mdl
## Warning: There were 13 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: loan_word_red ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     3.90      3.31     0.28    12.99 1.00     1479     2243
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.18      0.34     0.66     1.97 1.00     1862     2767
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.69      0.33    -1.37    -0.08 1.00     2173     3048
## syear_1       3.11      9.19    -9.92    27.42 1.01     2303     1929
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

loan_mdl_df <- conditional_effects(loan_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible interval ribbon.

loan_p <- loan_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # `linewidth` replaces the `size` aesthetic for lines
  # (deprecated in ggplot2 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

loan_p <- loan_p +
  scale_x_continuous(breaks = seq(from = 1960, to = 2020, by = 10)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  labs(x = 'Year', y = 'Probability of loan word')

# Show and save:

loan_p

ggsave('../figures/pdf/loan.pdf', loan_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/loan.png', loan_p,
       width = 5.8, height = 3.7)

6.2.4 Writing direction

Let’s look at vertical left/right writing, vertical_RL_writing:

soju |> 
  adorn_percentages(vertical_RL_writing) |> 
  print(n = Inf)
## # A tibble: 3 × 3
##   vertical_RL_writing     n p    
##   <chr>               <int> <chr>
## 1 No                    754 95%  
## 2 Yes                    39 5%   
## 3 No (vertical_LR)        4 1%

What are the vertical right-left ones, that is, the traditional writing direction?

soju |> 
  filter(vertical_RL_writing == 'Yes') |> 
  select(id, year, company, brand) |> 
  arrange(desc(year)) |> 
  print(n = Inf)
## # A tibble: 39 × 4
##    id     year company        brand                       
##    <chr> <dbl> <chr>          <chr>                       
##  1 499    2019 Mackis 마키스  Ijewoolinn 이제우린         
##  2 704    2019 GeumBok 금복   Charm Soju 참소주           
##  3 705    2019 GeumBok 금복   Charm Soju 참소주           
##  4 420    2017 DaeSun 대선    Dasun 대선                  
##  5 423    2017 DaeSun 대선    Dasun 대선                  
##  6 696    2011 GeumBok 금복   Charm Soju 참소주           
##  7 229    1998 JinRo 진로     Chamjinisulro 참진이슬로    
##  8 182    1995 Andong 안동    Andong Soju 안동소주        
##  9 183    1995 Andong 안동    Andong Soju 안동소주        
## 10 187    1992 JinRo 진로     Biseon 비선                 
## 11 71     1986 BaekHwa 백화   Baekhwa 백화                
## 12 74     1986 BoHae 보해     Bohae 보해                  
## 13 80     1982 BoHae 보해     Bohae Maechwi  보해 매취    
## 14 58     1979 JoHae 조해     Johae 조해                  
## 15 45     1978 DaeSun 대선    Daesun 대선                 
## 16 41     1972 DaeSun 대선    Daesun 대선                 
## 17 63     1971 JinRo 진로     Jinro 진로                  
## 18 14     1969 DaeHan 대한    Baekhwa Subok 백화 수복     
## 19 12     1968 DaeHan 대한    Baekhwa 백화                
## 20 13     1968 DaeHan 대한    Baekhwa Subok 백화 수복     
## 21 19     1967 SamHak 삼학    Samhak Daewangpyo 삼학대왕표
## 22 9      1966 DaeHan 대한    Baekhwa 백화                
## 23 10     1966 DaeHan 대한    Baekhwa 백화                
## 24 15     1966 BoBae 보배     Bobae 보배                  
## 25 18     1966 SamHak 삼학    Samhak Daewangpyo 삼학대왕표
## 26 29     1966 SeoGwang 서광  Jinro 진로                  
## 27 36     1966 TaePyeong 태평 007Soju 007소주             
## 28 27     1965 SeoGwang 서광  Jinro 진로                  
## 29 28     1965 SeoGwang 서광  Jinro 진로                  
## 30 8      1964 DaeHan 대한    Baekhwa 백화                
## 31 23     1964 SeoGwang 서광  Jinro 진로                  
## 32 24     1964 SeoGwang 서광  Jinro 진로                  
## 33 25     1964 SeoGwang 서광  Jinro 진로                  
## 34 3      1963 DaeJeon 대전   Chungseong 충성             
## 35 6      1963 BaekYang 백양  BaekYang 백양               
## 36 7      1963 DaeHan 대한    Baekhwa 백화                
## 37 22     1963 SeoGwang 서광  Jinro 진로                  
## 38 34     1963 CheongRo 청로  Cheongro 청로               
## 39 35     1963 CheongRo 청로  Cheongro 청로

What are the vertical left-to-right ones?

soju |> 
  filter(vertical_RL_writing == 'No (vertical_LR)') |> 
  select(id, year, company, brand)
## # A tibble: 4 × 4
##   id     year company        brand                      
##   <chr> <dbl> <chr>          <chr>                      
## 1 419    2017 DaeSun 대선    Dasun 대선                 
## 2 472    2019 DaeSun 대선    Gogeup Soju 고급소주       
## 3 473    2019 DaeSun 대선    Gogeup Soju 고급소주       
## 4 685    2018 ChoongBuk 충북 Cool Cheongpung 시원한 청풍

We will collapse this into a binary: both vertical categories (the traditional right-to-left ads and the rarer left-to-right ones) become ‘vertical’, versus ‘horizontal’:

# Collapse writing direction into a binary: plain 'No' becomes
# 'horizontal'; everything else — 'Yes' (vertical right-to-left) AND
# 'No (vertical_LR)' (vertical left-to-right) — becomes 'vertical'.

soju <- soju |> 
  mutate(vertical_red = if_else(vertical_RL_writing == 'No',
                                'horizontal', 'vertical'))

# Check: expect 754 horizontal and 43 (= 39 + 4) vertical:

soju |> 
  adorn_percentages(vertical_red)

Anyway, to look at time trends, using the binary variable vertical_red:

# Stacked bars: within-decade proportion of horizontal vs. vertical ads.

soju |>
  count(year_binned, vertical_red) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = year_binned, y = prop, fill = vertical_red)) +
  geom_col(color = 'black') +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The vertical writing orientation was present in the 1960’s and then really drops off.

For reporting, here is proportion of horizontal by decade (year_binned):

# Per-decade percentages of horizontal vs. vertical ads, for reporting.

soju |>
  count(year_binned, vertical_red) |>
  group_by(year_binned) |>
  mutate(p = round(n / sum(n), 2),
         percentage = str_c(100 * p, '%'))
## # A tibble: 12 × 5
## # Groups:   year_binned [7]
##    year_binned vertical_red     n     p percentage
##          <dbl> <chr>        <int> <dbl> <chr>     
##  1        1960 horizontal      14  0.39 39%       
##  2        1960 vertical        22  0.61 61%       
##  3        1970 horizontal      27  0.87 87%       
##  4        1970 vertical         4  0.13 13%       
##  5        1980 horizontal      13  0.81 81%       
##  6        1980 vertical         3  0.19 19%       
##  7        1990 horizontal     152  0.97 97%       
##  8        1990 vertical         4  0.03 3%        
##  9        2000 horizontal     132  1    100%      
## 10        2010 horizontal     325  0.97 97%       
## 11        2010 vertical        10  0.03 3%        
## 12        2020 horizontal      91  1    100%

Build a model of the vertical ones:

# Factor-code the vertical_red variable ('horizontal' first = reference
# level, so the model predicts p(vertical writing)):

soju <- mutate(soju,
               vertical_red = factor(vertical_red, levels = c('horizontal', 'vertical')))

# Generalized additive logistic regression model (with time splines):
# p(vertical writing) as a smooth function of year, with
# by-company random intercepts. NOTE: vertical cases are rare (43/797),
# so the smooth may be poorly constrained in late decades.

direction_mdl <- brm(bf(vertical_red ~ 1 +
                          s(year) +
                          (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(direction_mdl, file = '../models/direction_mdl.RData')

Load model:

# load('../models/direction_mdl.RData')

Check model:

# direction_mdl

Show posterior predictive simulations:

# pp_check(direction_mdl, ndraws = 100)

Extract conditional effects for plotting:

# direction_mdl_df <- conditional_effects(direction_mdl)$year

Make a plot of the curve:

# # Plot core:
# 
# direction_p <- direction_mdl_df |> 
#   ggplot(aes(x = year, y = estimate__,
#              ymin = lower__, ymax = upper__)) +
#   geom_ribbon(fill = 'grey', alpha = 0.7) +
#   geom_line(col = 'purple', size = 1.25)
# 
# # Axes and labels:
# 
# direction_p <- direction_p +
#   scale_x_continuous(breaks = seq(1960, 2020, 10)) +
#   scale_y_continuous(limits = c(0, 1),
#                      expand = c(0, 0)) +
#   xlab('Year') +
#   ylab('Probability of vertical writing')
# 
# # Show and save:
# 
# direction_p
# ggsave('../figures/pdf/writing_direction.pdf', direction_p,
#        width = 5.8, height = 3.7)
# ggsave('../figures/png/writing_direction.png', direction_p,
#        width = 5.8, height = 3.7)

6.3 Ideophones and sound objects

6.3.1 Ideophones

What is the breakdown of counts, like, how many ads have 1, 2, or 3 ideophones etc.?

soju |> 
  adorn_percentages(ideophone_word_count) |> 
  print(n = Inf)
## # A tibble: 5 × 3
##   ideophone_word_count     n p    
##                  <dbl> <int> <chr>
## 1                   NA   749 94%  
## 2                    1    39 5%   
## 3                    2     6 1%   
## 4                    3     2 0%   
## 5                    5     1 0%

NA’s indicate ads without ideophones.

Easier to just look at the binary variable:

soju |> 
  adorn_percentages(ideophone_red)
## # A tibble: 2 × 3
##   ideophone_red     n p    
##   <chr>         <int> <chr>
## 1 no              749 94%  
## 2 yes              48 6%

As elsewhere, we’ll use the yes/no variable for looking at time trends.

What are the ideophones? Collapse categories that only differ in punctuation for this, but we need to be careful that one cell contains ‘지끈지끈, 울렁울렁’, which we’ll want to have on separate lines.

# Build the ideophone frequency table. Strip every non-Hangul character
# so entries differing only in punctuation collapse into one row; then
# split the one combined cell ('지끈지끈, 울렁울렁', which becomes
# '지끈지끈울렁울렁' after stripping) back into its two ideophones.

ideophone_tab <- soju |> 
  mutate(ideophone_clean = str_replace_all(ideophone,
                                           '[^가-힣]',
                                           '')) |>
  count(ideophone_clean) |> 
  filter(!is.na(ideophone_clean)) |> 
  # Relabel the combined cell as its first ideophone ...
  mutate(ideophone_clean = if_else(ideophone_clean == '지끈지끈울렁울렁',
                             '지끈지끈', ideophone_clean)) |>   
  # ... and hand-append the second one as its own row (n = 1):
  bind_rows(tibble(ideophone_clean = '울렁울렁',
            n = 1)) |> 
  arrange(desc(n)) |> 
  # Format proportions as whole-number percentage strings:
  mutate(p = n / sum(n),
         p = round(p, 2),
         p = p * 100,
         p = str_c(p, '%'))

# Show and save:

ideophone_tab
## # A tibble: 28 × 3
##    ideophone_clean     n p    
##    <chr>           <dbl> <chr>
##  1 딱                 11 23%  
##  2 술술                4 8%   
##  3 짠                  4 8%   
##  4 깨끗                2 4%   
##  5 봄봄봄봄            2 4%   
##  6 쏘옥                2 4%   
##  7 확                  2 4%   
##  8 그린그린            1 2%   
##  9 달달                1 2%   
## 10 돌아돌아            1 2%   
## # ℹ 18 more rows
write_excel_csv(ideophone_tab, '../summary_tables/ideophone_table.csv')

Look at the time trend for the reduced variable ideophone_red (Yes/No) as a proportion and a stacked bar plot:

# Plot core: within-decade proportion of ads with/without ideophones.

ideophone_bar_p <- soju |>
  count(year_binned, ideophone_red) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = year_binned, y = prop, fill = ideophone_red)) +
  geom_col(color = 'black')

# Axes and labels:

ideophone_bar_p <- ideophone_bar_p +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show:

ideophone_bar_p

For reporting, here is proportion of ads with ideophones counted by decade (year_binned):

# Per-decade percentage of ads with ideophones, for reporting.

soju |>
  count(year_binned, ideophone_red) |>
  group_by(year_binned) |>
  mutate(p = round(n / sum(n), 2),
         percentage = str_c(100 * p, '%')) |>
  filter(ideophone_red != 'no')
## # A tibble: 7 × 5
## # Groups:   year_binned [7]
##   year_binned ideophone_red     n     p percentage
##         <dbl> <chr>         <int> <dbl> <chr>     
## 1        1960 yes               1  0.03 3%        
## 2        1970 yes               1  0.03 3%        
## 3        1980 yes               1  0.06 6%        
## 4        1990 yes               5  0.03 3%        
## 5        2000 yes               9  0.07 7%        
## 6        2010 yes              23  0.07 7%        
## 7        2020 yes               8  0.09 9%

Build a model of p(y = has ideophone), which will be a logistic regression model:

Bodo action point: Need to recompile with even higher adapt_delta and possibly better priors to get rid of divergent transitions.

# Factor-code the ideophone_red variable ('no' first = reference level,
# so the model predicts p(ideophone present)):

soju <- mutate(soju,
               ideophone_red = factor(ideophone_red, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):
# p(ideophone present) as a smooth function of year, with
# by-company random intercepts.

ideophone_mdl <- brm(bf(ideophone_red ~ 1 +
                        s(year) +
                        (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings (this fit still produced divergent
                 # transitions — see action point above):
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(ideophone_mdl, file = '../models/ideophone_mdl.RData')

Load:

load('../models/ideophone_mdl.RData')

Show posterior predictive simulations:

pp_check(ideophone_mdl, ndraws = 100)

pp_check(ideophone_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

ideophone_mdl
## Warning: There were 13 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: ideophone_red ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     1.31      1.28     0.04     4.78 1.00     3223     3122
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.78      0.27     0.36     1.42 1.00     3584     4582
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -2.91      0.31    -3.57    -2.35 1.00     3761     4560
## syear_1       3.34      4.12    -4.35    13.28 1.00     3205     1909
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

ideophone_mdl_df <- conditional_effects(ideophone_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible interval ribbon.

ideophone_p <- ideophone_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # `linewidth` replaces the `size` aesthetic for lines
  # (deprecated in ggplot2 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

ideophone_p <- ideophone_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 0.35),
                     expand = c(0, 0)) +
  xlab('Year') +
  # The curve is a model-predicted probability (like the roman and loan
  # plots), not a raw proportion — label it accordingly:
  ylab('Probability of ideophone')

# Show and save:

ideophone_p

ggsave('../figures/pdf/ideophone.pdf', ideophone_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/ideophone.png', ideophone_p,
       width = 5.8, height = 3.7)

6.3.2 Sound objects

What is the breakdown of counts, like, how many ads have 1, 2, or 3 sound objects etc.?

soju |> 
  adorn_percentages(sound_object_word_count) |> 
  print(n = Inf)
## # A tibble: 5 × 3
##   sound_object_word_count     n p    
##                     <dbl> <int> <chr>
## 1                      NA   765 96%  
## 2                       1    28 4%   
## 3                       2     2 0%   
## 4                       3     1 0%   
## 5                       4     1 0%

NA’s indicate ads without sound objects.

Easier to just look at the binary variable:

soju |> 
  adorn_percentages(sound_object_red)
## # A tibble: 2 × 3
##   sound_object_red     n p    
##   <chr>            <int> <chr>
## 1 no                 765 96%  
## 2 yes                 32 4%

As elsewhere, we’ll use the yes/no variable for looking at time trends.

What are the sound objects? We need to make sure that ‘ㅇㅋ~, 앗싸~’ and ‘야야야, 차차차’ are on separate rows:

# Build the sound-object frequency table. Two cells contain TWO sound
# objects each ('ㅇㅋ~, 앗싸~' and '야야야, 차차차'): relabel each cell
# as its second item, then hand-append the first items as their own
# rows (n = 1 each).

sound_object_tab <- soju |> 
  mutate(sound_object = if_else(sound_object == 'ㅇㅋ~, 앗싸~',
                                '앗싸~', sound_object),
         sound_object = if_else(sound_object == '야야야, 차차차',
                                '차차차', sound_object)) |> 
  count(sound_object) |> 
  bind_rows(tibble(sound_object = c('ㅇㅋ~', '야야야'),
                   n = c(1, 1))) |> 
  filter(!is.na(sound_object)) |>
  # Secondary descending sort on the string breaks ties deterministically:
  arrange(desc(n),
          desc(sound_object))|> 
  # Format proportions as whole-number percentage strings:
  mutate(p = n / sum(n),
         p = round(p, 2),
         p = p * 100,
         p = str_c(p, '%'))

# Show and save:

sound_object_tab
## # A tibble: 13 × 3
##    sound_object     n p    
##    <chr>        <dbl> <chr>
##  1 캬              10 29%  
##  2 예               9 26%  
##  3 어               3 9%   
##  4 앗싸~            3 9%   
##  5 크으             1 3%   
##  6 차차차           1 3%   
##  7 자아             1 3%   
##  8 이이잉           1 3%   
##  9 음               1 3%   
## 10 우~웅            1 3%   
## 11 오매             1 3%   
## 12 야야야           1 3%   
## 13 ㅇㅋ~            1 3%
write_excel_csv(sound_object_tab, '../summary_tables/sound_object_table.csv')

Look at the time trend for the reduced variable sound_object_red (Yes/No) as a proportion and a stacked bar plot:

# Plot core: within-decade proportion of ads with/without sound objects.

sound_object_bar_p <- soju |>
  count(year_binned, sound_object_red) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = year_binned, y = prop, fill = sound_object_red)) +
  geom_col(color = 'black')

# Axes and labels:

sound_object_bar_p <- sound_object_bar_p +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show:

sound_object_bar_p

For reporting, here is the proportion of ads with sound objects counted by decade (year_binned):

soju |> 
  count(year_binned, sound_object_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         p = round(p, 2),
         percentage = str_c(p * 100, '%')) |> 
  filter(sound_object_red != 'no')
## # A tibble: 4 × 5
## # Groups:   year_binned [4]
##   year_binned sound_object_red     n     p percentage
##         <dbl> <chr>            <int> <dbl> <chr>     
## 1        1990 yes                  6  0.04 4%        
## 2        2000 yes                 14  0.11 11%       
## 3        2010 yes                 11  0.03 3%        
## 4        2020 yes                  1  0.01 1%

Build a model of p(y = has sound object), which will be a logistic regression model:

Bodo action point: Need to recompile with even higher adapt_delta and possibly better priors to get rid of divergent transitions.

# Factor-code the variable with desired level order ('no' first, so the
# model predicts p(y = 'yes', i.e., has a sound object):

soju <- mutate(soju,
               sound_object_red = factor(sound_object_red, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines);
# random intercepts per company absorb company-level baseline differences:

sound_object_mdl <- brm(bf(sound_object_red ~ 1 +
                        s(year) +
                        (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model (reloaded below so the expensive fit is not rerun on re-knit):

save(sound_object_mdl, file = '../models/sound_object_mdl.RData')

Load:

load('../models/sound_object_mdl.RData')

Show posterior predictive simulations:

pp_check(sound_object_mdl, ndraws = 100)

pp_check(sound_object_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

sound_object_mdl
## Warning: There were 2 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: sound_object_red ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     6.95      4.32     2.02    17.80 1.00     4818     4546
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     2.33      1.01     0.95     4.84 1.00     4417     5256
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -5.81      1.40    -9.20    -3.79 1.00     4446     4469
## syear_1       1.07     19.13   -34.51    45.49 1.00     3921     3364
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

sound_object_mdl_df <- conditional_effects(sound_object_mdl)$year

Make a plot of the curve:

# Plot core: posterior curve with 95% credible ribbon from conditional_effects():

sound_object_p <- sound_object_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25)

# Axes and labels:

sound_object_p <- sound_object_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 0.35),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Proportion of sound objects')

# Show and save:

sound_object_p

# Bug fix: these ggsave() calls previously saved ideophone_p (a different
# plot object) under the sound_object file names; save sound_object_p.
ggsave('../figures/pdf/sound_object.pdf', sound_object_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/sound_object.png', sound_object_p,
       width = 5.8, height = 3.7)

6.3.3 Ideophones and sound objects together

What is the connection between ideophones and sound objects?

soju |> 
  adorn_percentages(ideophone_red, sound_object_red)
## # A tibble: 3 × 4
##   ideophone_red sound_object_red     n p    
##   <chr>         <chr>            <int> <chr>
## 1 no            no                 717 90%  
## 2 yes           no                  48 6%   
## 3 no            yes                 32 4%

Ideophones and sound objects never occur in the same ad together. Makes sense, for one because they’re both infrequent, so the baseline chance of this happening is low, but also because they probably fulfill similar function and are thus in complementary distribution.

6.3.4 Double plot

# Change titles (panel letters for the two-panel figure):

ideophone_p <- ideophone_p + ggtitle('a) Ideophones')
sound_object_p <- sound_object_p + ggtitle('b) Sound objects')

# Change y-axes: shared y-label on the left panel only; drop the right
# panel's tick labels so the panels sit flush next to each other:

ideophone_p <- ideophone_p + ylab('Probability') +
  theme(plot.title = element_text(face = 'bold'))
sound_object_p <- sound_object_p + ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))

# Merge (patchwork's `+` places the plots side by side):

double_p <- ideophone_p + sound_object_p

# Show and save:

double_p

ggsave(plot = double_p, filename = '../figures/pdf/ideophone_sound_object.pdf',
       width = 9.5, height = 3.5)
ggsave(plot = double_p, filename = '../figures/png/ideophone_sound_object.png',
       width = 9.5, height = 3.5)

6.4 Politeness indexicals

6.4.1 Slogan ending

Check the slogan endings:

soju |> 
  filter(!is.na(main_slogan_ending_red)) |> 
  adorn_percentages(main_slogan_ending_red)
## # A tibble: 4 × 3
##   main_slogan_ending_red     n p    
##   <chr>                  <int> <chr>
## 1 Verb                     386 49%  
## 2 Noun                     364 46%  
## 3 English                   25 3%   
## 4 Nonverbal speech sound    20 3%

Let’s check this over binned time:

# Stacked proportion bars of main_slogan_ending_red per binned decade:

soju |> 
  mutate(year_binned = factor(year_binned)) |> 
  count(year_binned, main_slogan_ending_red) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = main_slogan_ending_red)) +
  geom_col(col = 'black') +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')

There’s a rise and fall of verbs over time. And nonverbal speech sounds are unattested for the earlier years. Let’s compute the descriptive percentages:

soju |> 
  count(year_binned, main_slogan_ending_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         p = round(p, 2),
         percentage = str_c(p * 100, '%')) |> 
  filter(main_slogan_ending_red == 'Noun')
## # A tibble: 7 × 5
## # Groups:   year_binned [7]
##   year_binned main_slogan_ending_red     n     p percentage
##         <dbl> <chr>                  <int> <dbl> <chr>     
## 1        1960 Noun                      28  0.78 78%       
## 2        1970 Noun                      24  0.77 77%       
## 3        1980 Noun                      11  0.69 69%       
## 4        1990 Noun                      82  0.53 53%       
## 5        2000 Noun                      32  0.24 24%       
## 6        2010 Noun                     131  0.39 39%       
## 7        2020 Noun                      56  0.62 62%

Build a model of the slogan ending, p(y = ends in a noun), where 'Verb' is the reference level:

# Create a subset of only those that have nouns or verbs and then factor code this, setting verb as reference level:
# (With 'Verb' as reference, the model predicts p(y = 'Noun').)

soju_ending_red <- soju |> 
  filter(main_slogan_ending_red %in% c('Verb', 'Noun')) |> 
  mutate(main_slogan_ending_red = factor(main_slogan_ending_red,
                                         levels = c('Verb', 'Noun')))

# Generalized additive logistic regression model (with time splines);
# same specification as the other time-trend models in this script:

slogan_end_mdl <- brm(bf(main_slogan_ending_red ~ 1 +
                         s(year) +
                         (1|company)),
                 data = soju_ending_red,
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(slogan_end_mdl, file = '../models/slogan_end_mdl.RData')

Load:

load('../models/slogan_end_mdl.RData')

Show posterior predictive simulations:

pp_check(slogan_end_mdl, ndraws = 100)

pp_check(slogan_end_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

slogan_end_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: main_slogan_ending_red ~ 1 + s(year) + (1 | company) 
##    Data: soju_ending_red (Number of observations: 750) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     4.42      1.64     2.17     8.51 1.00     3790     4780
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.23      0.17     0.01     0.63 1.00     2331     3078
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.04      0.13    -0.26     0.24 1.00     4226     2642
## syear_1      10.08      6.65    -2.37    24.08 1.00     4212     4653
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

slogan_end_mdl_df <- conditional_effects(slogan_end_mdl)$year

Make a plot of the curve, first noun endings:

# Plot core: posterior curve with 95% credible ribbon from conditional_effects():

slogan_end_p <- slogan_end_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25)

# Axes and labels (full 0-1 probability scale):

slogan_end_p <- slogan_end_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of ending in a noun')

# Show and save:

slogan_end_p

ggsave('../figures/png/slogan_end.png', slogan_end_p,
       width = 5.8, height = 3.7)
ggsave('../figures/pdf/slogan_end.pdf', slogan_end_p,
       width = 5.8, height = 3.7)

6.4.2 Secondary slogan ending

Let’s do the same for secondary_slogan_ending:

soju |> 
  count(secondary_slogan_ending, sort = TRUE)
## # A tibble: 7 × 2
##   secondary_slogan_ending     n
##   <chr>                   <int>
## 1 Noun                      391
## 2 Verb                      227
## 3 <NA>                      146
## 4 Adjective (English)        11
## 5 Noun and particle          11
## 6 Noun (English)              8
## 7 Nonverbal speech sound      3

Rinse and repeat:

soju |> 
  filter(!is.na(secondary_slogan_ending_red)) |>
  count(year_binned, secondary_slogan_ending_red) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = secondary_slogan_ending_red)) +
  geom_col(col = 'black') +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

I’m not going to look at noun_token and verb_token and english_token in detail as that would be way too many categories.

6.4.3 Slogan politeness

soju |> 
  filter(!is.na(verb_token)) |> 
  adorn_percentages(verb_token)
## # A tibble: 6 × 3
##   verb_token     n p    
##   <chr>      <int> <chr>
## 1 ~니다        111 47%  
## 2 반말          88 37%  
## 3 ~요           33 14%  
## 4 게             2 1%   
## 5 술술           2 1%   
## 6 ~세요          1 0%

Let’s look at slogan_end_verb:

slogan_verb_counts <- soju |> 
  filter(!is.na(slogan_end_verb_red)) |> 
  adorn_percentages(slogan_end_verb_red)

# Show:

slogan_verb_counts
## # A tibble: 2 × 3
##   slogan_end_verb_red     n p    
##   <chr>               <int> <chr>
## 1 panmal                233 62%  
## 2 contaymal             145 38%

Plot this over time for the three most frequent cases, for which we should first get the Korean out of that count table for the first four rows, and then exclude the NA:

keep_these <- soju |> 
  adorn_percentages(slogan_end_verb) |> 
  slice_head(n = 4) |> 
  filter(!is.na(slogan_end_verb)) |> 
  pull(slogan_end_verb)

# Show:

keep_these
## [1] "반말"  "~요"   "~니다"

Make a plot of this:

# Stacked proportion bars for the three most frequent verb-ending types.
# Bug fix: scale_fill_manual(name = FALSE) is not a valid way to suppress
# the legend title (it renders "FALSE" as the title); use name = NULL,
# matching the convention used elsewhere in this script.

soju |> 
  filter(slogan_end_verb %in% keep_these) |>
  count(year_binned, slogan_end_verb) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n),
         # Order fill levels by overall frequency (the keep_these order):
         slogan_end_verb = factor(slogan_end_verb, levels = keep_these)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = slogan_end_verb)) +
  geom_col(col = 'black') +
  scale_fill_manual(values = c('steelblue', 'goldenrod3', 'purple'),
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The Korean text doesn’t print properly, but the purple one that declines over time is the -nida form (less politeness over time in ads?), the blue one that rises massively is panmal, then -yo is in the middle, kinda constant.

Especially if we treat “-yo” as also being more polite, in contrast to panmal, then this would be a relatively clear trend towards more panmal / informal speech in the last two decades.

Let’s make a plot of just the reduced contaymal versus panmal variable slogan_end_verb_red:

# Stacked proportion bars of contaymal vs. panmal per binned decade.
# Bug fix: scale_fill_manual(name = FALSE) is not a valid way to suppress
# the legend title (it renders "FALSE" as the title); use name = NULL,
# matching the convention used elsewhere in this script.

soju |> 
  count(year_binned, slogan_end_verb_red) |> 
  filter(!is.na(slogan_end_verb_red)) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = slogan_end_verb_red)) +
  geom_col(col = 'black') +
  scale_fill_manual(values = c('steelblue', 'goldenrod3'),
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Clear rise of panmal (informal style) over time. What are the descriptive percentages for that?

soju |> 
  count(year_binned, slogan_end_verb_red) |> 
  filter(!is.na(slogan_end_verb_red)) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         p = round(p, 2),
         percentage = str_c(p * 100, '%'))
## # A tibble: 14 × 5
## # Groups:   year_binned [7]
##    year_binned slogan_end_verb_red     n     p percentage
##          <dbl> <chr>               <int> <dbl> <chr>     
##  1        1960 contaymal               7  0.88 88%       
##  2        1960 panmal                  1  0.12 12%       
##  3        1970 contaymal               6  0.86 86%       
##  4        1970 panmal                  1  0.14 14%       
##  5        1980 contaymal               4  0.8  80%       
##  6        1980 panmal                  1  0.2  20%       
##  7        1990 contaymal              33  0.46 46%       
##  8        1990 panmal                 39  0.54 54%       
##  9        2000 contaymal              39  0.44 44%       
## 10        2000 panmal                 49  0.56 56%       
## 11        2010 contaymal              47  0.27 27%       
## 12        2010 panmal                125  0.73 73%       
## 13        2020 contaymal               9  0.35 35%       
## 14        2020 panmal                 17  0.65 65%

Make a model of this, modeling p(y = has panmal), which will be a logistic regression model:

# Factor-code the variable with desired order of levels ('contaymal' first,
# so the model predicts p(y = 'panmal'):

soju <- mutate(soju,
               slogan_end_verb_red = factor(slogan_end_verb_red,
                                            levels = c('contaymal', 'panmal')))

# Generalized additive logistic regression model (with time splines);
# rows with NA slogan_end_verb_red are dropped in the data argument:

panmal_mdl <- brm(bf(slogan_end_verb_red ~ 1 +
                       s(year) +
                       (1|company)),
                 data = filter(soju, !is.na(slogan_end_verb_red)),
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(panmal_mdl, file = '../models/panmal_mdl.RData')

Bodo action point: 4 divergent transitions to kill.

Load:

load('../models/panmal_mdl.RData')

Show posterior predictive simulations:

pp_check(panmal_mdl, ndraws = 100)

pp_check(panmal_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

panmal_mdl
## Warning: There were 4 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: slogan_end_verb_red ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, !is.na(slogan_end_verb_red)) (Number of observations: 378) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     1.75      1.61     0.05     5.98 1.00     2438     3426
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 16) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.02      0.42     0.40     2.05 1.00     2309     3720
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     0.46      0.37    -0.36     1.14 1.00     2431     3158
## syear_1       5.03      4.80    -7.12    13.29 1.00     2806     1870
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

panmal_mdl_df <- conditional_effects(panmal_mdl)$year

Make a plot of the curve:

# Posterior probability of panmal over time (95% credible ribbon),
# built in a single chain (core + axes/labels together):

panmal_p <- panmal_mdl_df |>
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25) +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of panmal')

# Show and save:

panmal_p

ggsave('../figures/pdf/panmal.pdf', panmal_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/panmal.png', panmal_p,
       width = 5.8, height = 3.7)

6.4.4 Sound objects

Check:

soju |> 
  adorn_percentages(main_slogan_ending)
## # A tibble: 7 × 3
##   main_slogan_ending         n p    
##   <chr>                  <int> <chr>
## 1 Verb                     386 48%  
## 2 Noun                     364 46%  
## 3 Nonverbal speech sound    20 3%   
## 4 Noun (English)            15 2%   
## 5 Adjective (English)        9 1%   
## 6 <NA>                       2 0%   
## 7 Adverb (English)           1 0%

Check when all ads were published that featured nonverbal speech sounds:

soju |> 
  filter(main_slogan_ending == 'Nonverbal speech sound') |> 
  select(year) |> 
  adorn_percentages(year) |> 
  arrange(year)
## # A tibble: 7 × 3
##    year     n p    
##   <dbl> <int> <chr>
## 1  2001     2 10%  
## 2  2008     5 25%  
## 3  2015     4 20%  
## 4  2016     4 20%  
## 5  2017     3 15%  
## 6  2018     1 5%   
## 7  2021     1 5%

Check the tokens:

soju |> 
  filter(!is.na(slogan_end_nonverbal)) |> 
  adorn_percentages(slogan_end_nonverbal)
## # A tibble: 9 × 3
##   slogan_end_nonverbal     n p    
##   <chr>                <int> <chr>
## 1 카~                      5 26%  
## 2 콜                       4 21%  
## 3 짠                       3 16%  
## 4 쏘~옥                    2 11%  
## 5 BAAM                     1 5%   
## 6 Chu                      1 5%   
## 7 우~웅                    1 5%   
## 8 이이잉                   1 5%   
## 9 크으                     1 5%

6.5 Bottle features

6.5.1 Bottle presence

Let’s look at the bottle_presence variable:

soju |> 
  adorn_percentages(bottle_presence)
## # A tibble: 4 × 3
##   bottle_presence     n p    
##   <chr>           <int> <chr>
## 1 Superimposed      451 57%  
## 2 Yes               324 41%  
## 3 No                 16 2%   
## 4 Drawing             6 1%

Plot this over time:

soju |> 
  count(year_binned, bottle_presence) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = bottle_presence)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Clearly more superimposed type later… would make sense though since this probably scales with image editing technology in terms of software, which has become better over time, making this easier to do for advertising and marketing firms.

Check the simplified variable:

soju |> 
  adorn_percentages(has_bottle)
## # A tibble: 2 × 3
##   has_bottle     n p    
##   <chr>      <int> <chr>
## 1 yes          781 98%  
## 2 no            16 2%

The 16 ads without a bottle, from when are they?

soju |>
  filter(has_bottle == 'no') |>
  select(id, company, brand, year)
## # A tibble: 16 × 4
##    id                          company        brand                         year
##    <chr>                       <chr>          <chr>                        <dbl>
##  1 vocal_utterance_word_number GeumGok 금곡   Geumgok 금곡                  1968
##  2 3                           DaeJeon 대전   Chungseong 충성               1963
##  3 21                          SeoGwang 서광  Jinro 진로                    1960
##  4 33                          CheonMa 천마   Cheonma 천마                  1961
##  5 34                          CheongRo 청로  Cheongro 청로                 1963
##  6 49                          BaekGwang 백광 Baekgwang Milkamju 백광밀감…  1971
##  7 52                          BaekHwa 백화   Baekhwa 백화                  1974
##  8 55                          BoBae 보배     Bobae 보배                    1972
##  9 58                          JoHae 조해     Johae 조해                    1979
## 10 59                          JinRo 진로     Jinro 진로                    1970
## 11 60                          JinRo 진로     Jinro 진로                    1970
## 12 478                         Lotte 롯데     Chumchurum Cool 처음처럼 쿨   2012
## 13 479                         Lotte 롯데     Chumchurum Cool 처음처럼 쿨   2012
## 14 480                         Lotte 롯데     Chumchurum Cool 처음처럼 쿨   2012
## 15 481                         Lotte 롯데     Chumchurum Cool 처음처럼 쿨   2012
## 16 539                         BoHae 보해     Yipsejoo 잎새주               2011

Count the decades for this:

soju |>
  filter(has_bottle == 'no') |>
  count(year_binned)
## # A tibble: 3 × 2
##   year_binned     n
##         <dbl> <int>
## 1        1960     5
## 2        1970     6
## 3        2010     5

Quantify decade:

soju |> 
  count(year_binned, has_bottle) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         percentage = round(p, 2) * 100,
         percentage = str_c(percentage, '%'))
## # A tibble: 10 × 5
## # Groups:   year_binned [7]
##    year_binned has_bottle     n      p percentage
##          <dbl> <chr>      <int>  <dbl> <chr>     
##  1        1960 no             5 0.139  14%       
##  2        1960 yes           31 0.861  86%       
##  3        1970 no             6 0.194  19%       
##  4        1970 yes           25 0.806  81%       
##  5        1980 yes           16 1      100%      
##  6        1990 yes          156 1      100%      
##  7        2000 yes          132 1      100%      
##  8        2010 no             5 0.0149 1%        
##  9        2010 yes          330 0.985  99%       
## 10        2020 yes           91 1      100%

For reporting, report differences for those where the bottle is present:

soju |> 
  filter(has_bottle == 'yes') |> 
  adorn_percentages(bottle_presence)
## # A tibble: 3 × 3
##   bottle_presence     n p    
##   <chr>           <int> <chr>
## 1 Superimposed      451 58%  
## 2 Yes               324 41%  
## 3 Drawing             6 1%

Make a model of bottle presence over time, using has_bottle.

Build a model of p(y = has bottle), which will be a logistic regression model:

# Factor-code the variable with desired level order ('no' first, so the
# model predicts p(y = 'yes', i.e., ad shows a bottle):

soju <- mutate(soju,
               has_bottle = factor(has_bottle, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines);
# same specification as the other time-trend models in this script:

bottle_presence_mdl <- brm(bf(has_bottle ~ 1 +
                                s(year) +
                                (1|company)),
                           data = soju,
                           family = bernoulli,
                  
                           # MCMC settings:
                    
                           cores = 4, seed = 42,
                           chains = 4, iter = 6000, warmup = 4000,
                           control = list(adapt_delta = 0.99,
                                          max_treedepth = 12))

# Save model:

save(bottle_presence_mdl, file = '../models/bottle_presence_mdl.RData')

Bodo action point: 16 divergent transitions (!!) need to be eliminated.

Load:

load('../models/bottle_presence_mdl.RData')

Show posterior predictive simulations:

pp_check(bottle_presence_mdl, ndraws = 100)

pp_check(bottle_presence_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

bottle_presence_mdl
## Warning: There were 16 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: has_bottle ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     3.77      3.78     0.15    14.29 1.00     1831     1548
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     2.18      0.89     0.91     4.30 1.00     2317     2486
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     5.60      0.99     4.00     7.87 1.00     3543     3478
## syear_1      16.28     13.31     0.55    53.58 1.00     1984     1159
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

bottle_presence_mdl_df <- conditional_effects(bottle_presence_mdl)$year

Make a plot of the curve:

# Plot core: posterior curve with 95% credible ribbon from conditional_effects():

bottle_presence_p <- bottle_presence_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25)

# Axes and labels (full 0-1 probability scale):

bottle_presence_p <- bottle_presence_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of bottle in ad')

# Show and save:

bottle_presence_p

ggsave('../figures/pdf/bottle_presence.pdf', bottle_presence_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/bottle_presence.png', bottle_presence_p,
       width = 5.8, height = 3.7)

6.5.2 Bottle position

Another noteworthy pattern is that there’s more ads without any bottle in the past. So, over time, people have emphasized the bottle more! Certainly here there is less of a trivial technological explanation for that, in that people for sure could’ve made photos of the bottle in the past. So this probably reflects a more strategic decision.

soju |> 
  adorn_percentages(bottle_position)
## # A tibble: 16 × 3
##    bottle_position     n p    
##    <chr>           <int> <chr>
##  1 Bottom right      329 41%  
##  2 Bottom left       101 13%  
##  3 Bottom centre      61 8%   
##  4 Right              57 7%   
##  5 Centre             48 6%   
##  6 Left               40 5%   
##  7 Centre right       37 5%   
##  8 Center center      32 4%   
##  9 Centre left        25 3%   
## 10 Top right          20 3%   
## 11 <NA>               16 2%   
## 12 Top centre         14 2%   
## 13 Top left           10 1%   
## 14 Bottom              3 0%   
## 15 Cetnre left         3 0%   
## 16 Top                 1 0%

Some cleaning to do here, including typos. Let’s simplify the centre stuff and at the same time get rid of the typo:

# Normalize British spelling and the typo in one pass:
# 'entre' catches 'Centre' -> 'Center'; 'etnre' catches the
# typo 'Cetnre' -> 'Center'.

soju <- mutate(soju,
               bottle_position = str_replace_all(bottle_position,
                                                 '(entre)|(etnre)',
                                                 'enter'))

Ok, some issues here. The bottle_presence variable has == Yes for one case that then has no bottle position. How can a bottle be present that has no position?

The bottle_position variable is different from the logo_location variable, in that the bottle can of course be larger or smaller, and then be positioned differently — it’s not as easily locatable into horizontal and vertical quadrants. For that reason, we will just look at the most common categories again, but leaving these categories unchanged for now, to see whether there are any noteworthy trends over time.

# Vector of first six from table of counts (the six most common positions):

bottle_positions <- soju |> 
  count(bottle_position, sort = TRUE) |> 
  slice_head(n = 6) |> 
  pull(bottle_position)

# Plot: stacked proportions of the top-six positions per binned decade;
# fill levels ordered by overall frequency via the factor() call:

soju |> 
  filter(bottle_position %in% bottle_positions) |> 
  mutate(bottle_position = factor(bottle_position, levels = bottle_positions)) |> 
  count(year_binned, bottle_position) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = bottle_position)) +
  geom_col(col = 'black') +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The bottle has surely moved more towards the bottom and more towards the right. It’s definitely in less central positions as before. This goes against my idea that the bottle has risen in importance over time. It suggests more that if the bottle was present in the past, it was also in a more central position.

We may want to think about what this means in terms of scanning directions for images, and whether given what we know about this, people will notice the bottle first or last. Perhaps there’s some literature on scanning direction in images or even specifically ads (eye-tracking studies?), and then also literature on placement of stuff inside ads?

6.5.3 Bottle color

Bottle color is next:

soju |> 
  filter(!is.na(bottle_color)) |> 
  adorn_percentages(bottle_color)
## # A tibble: 6 × 3
##   bottle_color       n p    
##   <chr>          <int> <chr>
## 1 Green            637 82%  
## 2 Clear            101 13%  
## 3 Brown or black    30 4%   
## 4 Blue               3 0%   
## 5 White              2 0%   
## 6 Cannot tell        1 0%

Then finally, isn’t ‘Clear’ a separate piece of information? I suspect that some of the bottles coded as Green are also sometimes clear? (Can we check?) Either way, it would be good if this variable is only hue, and not a mix of hue and transparency, which are two separate variables.

Let’s look at temporal trends:

# Stacked bars: bottle color proportions per time bin; NA and
# 'Cannot tell' are dropped before proportions are computed.

soju |> 
  count(year_binned, bottle_color) |> 
  filter(!is.na(bottle_color),
         bottle_color != 'Cannot tell') |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ungroup() |> 
  ggplot(aes(x = year_binned, y = prop, fill = bottle_color)) +
  geom_col(col = 'black') +
  scale_fill_manual(name = NULL,
                    values = c('blue', 'brown', 'lightblue',
                               'green', 'white')) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = NULL, y = 'Proportion')

A definite shift towards green, and no more black or browns in later years, as well as fewer clears.

Let’s get some nice numbers to report on these trends:

# Counts, proportions, and formatted percentages of the three main
# bottle colors per time bin:

soju |> 
  filter(bottle_color %in% c('Brown or black', 'Green', 'Clear')) |> 
  count(year_binned, bottle_color) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         percentage = str_c(round(p, 2) * 100, '%'))
## # A tibble: 15 × 5
## # Groups:   year_binned [7]
##    year_binned bottle_color       n      p percentage
##          <dbl> <chr>          <int>  <dbl> <chr>     
##  1        1960 Brown or black    11 0.355  35%       
##  2        1960 Clear             20 0.645  65%       
##  3        1970 Brown or black     8 0.333  33%       
##  4        1970 Clear             16 0.667  67%       
##  5        1980 Brown or black     5 0.333  33%       
##  6        1980 Clear             10 0.667  67%       
##  7        1990 Brown or black     6 0.04   4%        
##  8        1990 Clear             21 0.14   14%       
##  9        1990 Green            123 0.82   82%       
## 10        2000 Clear              3 0.0227 2%        
## 11        2000 Green            129 0.977  98%       
## 12        2010 Clear             15 0.0455 5%        
## 13        2010 Green            315 0.955  95%       
## 14        2020 Clear             16 0.186  19%       
## 15        2020 Green             70 0.814  81%

Make a model of this, green bottles over time, modeling p(y = has green bottle), which will be a logistic regression model:

# Factor-code the outcome with 'no' as the reference level, so the
# bernoulli model below estimates p(has_green_bottle = 'yes'):

soju <- mutate(soju,
               has_green_bottle = factor(has_green_bottle, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines),
# restricted to ads that show a bottle and are in color (bottle color
# is undefined otherwise), with by-company random intercepts:

green_bottle_mdl <- brm(bf(has_green_bottle ~ 1 +
                             s(year) +
                             (1|company)),
                        data = filter(soju, bottle_presence != 'No',
                                      overall_color == 'Color'),
                        family = bernoulli,
                 
                        # MCMC settings (high adapt_delta / treedepth to
                        # avoid divergences with spline + random effects):
                   
                        cores = 4, seed = 42,
                        chains = 4, iter = 6000, warmup = 4000,
                        control = list(adapt_delta = 0.99,
                                       max_treedepth = 12))

# Save model so the slow fit can be reloaded without re-sampling:

save(green_bottle_mdl, file = '../models/green_bottle_mdl.RData')

Load:

load('../models/green_bottle_mdl.RData')

Show posterior predictive simulations:

pp_check(green_bottle_mdl, ndraws = 100)

pp_check(green_bottle_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

green_bottle_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: has_green_bottle ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, bottle_presence != "No", overall_colo (Number of observations: 697) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)    12.76      5.96     5.64    27.71 1.00     3668     3964
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 14) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     2.25      1.10     0.75     4.90 1.00     1968     2800
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     2.86      0.80     1.06     4.31 1.00     2034     2844
## syear_1      12.98     30.94   -42.76    82.91 1.00     3262     3149
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

green_bottle_mdl_df <- conditional_effects(green_bottle_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible ribbon

green_bottle_p <- green_bottle_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # linewidth replaces size for lines (size deprecated, ggplot2 >= 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

green_bottle_p <- green_bottle_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of green bottle')

# Show and save:

green_bottle_p

ggsave('../figures/pdf/green_bottle.pdf', green_bottle_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/green_bottle.png', green_bottle_p,
       width = 5.8, height = 3.7)

Next, clear bottles over time, modeling p(y = has clear bottle), which will be a logistic regression model:

# Factor-code the outcome with 'no' as the reference level, so the
# bernoulli model below estimates p(has_clear_bottle = 'yes'):

soju <- mutate(soju,
               has_clear_bottle = factor(has_clear_bottle, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines);
# same specification and data subset as the green-bottle model above:

clear_bottle_mdl <- brm(bf(has_clear_bottle ~ 1 +
                             s(year) +
                             (1|company)),
                        data = filter(soju, bottle_presence != 'No',
                                      overall_color == 'Color'),
                        family = bernoulli,
                 
                        # MCMC settings:
                   
                        cores = 4, seed = 42,
                        chains = 4, iter = 6000, warmup = 4000,
                        control = list(adapt_delta = 0.99,
                                       max_treedepth = 12))

# Save model so the slow fit can be reloaded without re-sampling:

save(clear_bottle_mdl, file = '../models/clear_bottle_mdl.RData')

Load:

load('../models/clear_bottle_mdl.RData')

Show posterior predictive simulations:

pp_check(clear_bottle_mdl, ndraws = 100)

pp_check(clear_bottle_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

clear_bottle_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: has_clear_bottle ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, bottle_presence != "No", overall_colo (Number of observations: 697) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)    11.40      5.18     5.01    24.36 1.00     3339     4129
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 14) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     3.10      1.31     1.30     6.37 1.00     2483     4459
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -3.35      1.02    -5.39    -1.26 1.00     2493     3157
## syear_1      19.98     16.92   -11.90    54.75 1.00     4945     5267
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

clear_bottle_mdl_df <- conditional_effects(clear_bottle_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible ribbon

clear_bottle_p <- clear_bottle_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # linewidth replaces size for lines (size deprecated, ggplot2 >= 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

clear_bottle_p <- clear_bottle_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of clear bottle')

# Show and save:

clear_bottle_p

ggsave('../figures/pdf/clear_bottle.pdf', clear_bottle_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/clear_bottle.png', clear_bottle_p,
       width = 5.8, height = 3.7)

Same for has brown or black bottle variable has_black_or_brown_bottle, modeling p(y = has brown or black bottle), which will be a logistic regression model:

# Factor-code the outcome with 'no' as the reference level, so the
# bernoulli model below estimates p(has_black_or_brown_bottle = 'yes'):

soju <- mutate(soju,
               has_black_or_brown_bottle = factor(has_black_or_brown_bottle,
                                                  levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines);
# same specification and data subset as the green/clear models above:

brown_bottle_mdl <- brm(bf(has_black_or_brown_bottle ~ 1 +
                             s(year) +
                             (1|company)),
                        data = filter(soju, bottle_presence != 'No',
                                      overall_color == 'Color'),
                        family = bernoulli,
                 
                        # MCMC settings:
                   
                        cores = 4, seed = 42,
                        chains = 4, iter = 6000, warmup = 4000,
                        control = list(adapt_delta = 0.99,
                                       max_treedepth = 12))

# Save model so the slow fit can be reloaded without re-sampling:

save(brown_bottle_mdl, file = '../models/brown_bottle_mdl.RData')

Load:

load('../models/brown_bottle_mdl.RData')

Show posterior predictive simulations:

pp_check(brown_bottle_mdl, ndraws = 100)

pp_check(brown_bottle_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

brown_bottle_mdl
## Warning: There were 1 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: has_black_or_brown_bottle ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, bottle_presence != "No", overall_colo (Number of observations: 697) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     2.40      2.26     0.08     8.34 1.00     3914     3852
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 14) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.12      0.92     0.05     3.42 1.00     3719     3442
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -7.49      1.70   -11.65    -5.12 1.00     3535     1987
## syear_1     -15.58      9.08   -34.89     2.34 1.00     3171     1685
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

brown_bottle_mdl_df <- conditional_effects(brown_bottle_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible ribbon

brown_bottle_p <- brown_bottle_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # linewidth replaces size for lines (size deprecated, ggplot2 >= 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

brown_bottle_p <- brown_bottle_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of black or brown bottle')

# Show and save:

brown_bottle_p

ggsave('../figures/pdf/black_or_brown_bottle.pdf', brown_bottle_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/black_or_brown_bottle.png', brown_bottle_p,
       width = 5.8, height = 3.7)

Make a triple plot of the three main color categories:

# Change titles:

green_bottle_p <- green_bottle_p + ggtitle('a) Green bottles')
clear_bottle_p <- clear_bottle_p + ggtitle('b) Clear bottles')
brown_bottle_p <- brown_bottle_p + ggtitle('c) Black or brown bottles')

# Change y-axes: only the left panel keeps its axis text/label,
# since all three share the same 0-1 probability scale:

green_bottle_p <- green_bottle_p + ylab('Probability') +
  theme(plot.title = element_text(face = 'bold'))
clear_bottle_p <- clear_bottle_p + ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))
brown_bottle_p <- brown_bottle_p + ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))

# Merge (patchwork):

triple_p <- green_bottle_p + clear_bottle_p + brown_bottle_p

# Show and save. Path fixed: was '../figures/pdfbottle_color_triple.pdf'
# (missing '/'), which would silently write outside the pdf folder.
# A PNG version is saved too, consistent with all other figures:

triple_p

ggsave(plot = triple_p, filename = '../figures/pdf/bottle_color_triple.pdf',
       width = 15, height = 4.5)
ggsave(plot = triple_p, filename = '../figures/png/bottle_color_triple.png',
       width = 15, height = 4.5)

6.6 Bottle shape

What about bottle shape?

soju |> 
  count(bottle_shape, sort = TRUE) |> 
  print(n = Inf)
## # A tibble: 34 × 2
##    bottle_shape                                                           n
##    <chr>                                                              <int>
##  1 long neck                                                            523
##  2 short neck                                                            92
##  3 long thick neck                                                       38
##  4 <NA>                                                                  29
##  5 long neck and convex shoulder                                         22
##  6 long neck and long narrow body                                        15
##  7 short slanted neck and long narrow body                               15
##  8 long and thick neck                                                    7
##  9 short neck and long rectangular body                                   7
## 10 long neck and concave shoulder                                         5
## 11 short neck and long square body                                        4
## 12 short neck, long neck and long narrow body                             4
## 13 extra long neck and short narrow body                                  3
## 14 flask                                                                  3
## 15 short neck and long curved body                                        3
## 16 long neck and long angular narrow body                                 2
## 17 long neck and long narrow body, short neck and wide trapezoid body     2
## 18 long neck and short round body, long neck and short narrow body        2
## 19 long neck and wide round body, short wide neck and long round body     2
## 20 short neck and long round body                                         2
## 21 short neck and wide rectangular body                                   2
## 22 short neck long rectangular body and long neck                         2
## 23 straight neck and convex shoulder                                      2
## 24 long neck and short rectangular body                                   1
## 25 long neck and wide body                                                1
## 26 long neck with patterned glass                                         1
## 27 long straight neck                                                     1
## 28 rectangle                                                              1
## 29 short and thick neck, short neck and long rectangular body             1
## 30 short and wide neck                                                    1
## 31 short and wide neck and body                                           1
## 32 short neck and long round body, short neck and long square body        1
## 33 short neck square body, and long neck round bottom                     1
## 34 short neck square body, long neck round bottom                         1

Let’s look at the most dominant categories then:

# Extract vector of the six most frequent bottle shapes (NA excluded):

these_shapes <- soju |> 
  filter(!is.na(bottle_shape)) |> 
  count(bottle_shape, sort = TRUE) |> 
  slice_head(n = 6) |> 
  pull(bottle_shape)

# Plot basics: stacked shape proportions per (numeric) time bin

shape_p <- soju |> 
  filter(bottle_shape %in% these_shapes) |> 
  count(year_binned, bottle_shape) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ungroup() |> 
  ggplot(aes(x = year_binned, y = prop, fill = bottle_shape)) +
  geom_col(col = 'black', width = 6)

# Scales, axes and cosmetics:

shape_p <- shape_p +
  scale_x_continuous(breaks = seq(1960, 2020, by = 10),
                     labels = seq(1960, 2020, by = 10)) +
  scale_fill_brewer(palette = 'Spectral', name = NULL, direction = -1) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = NULL, y = 'Proportion') +
  theme(axis.text.x = element_text(face = 'bold', size = 12),
        legend.position = 'bottom')

# Show and save:

shape_p

ggsave(plot = shape_p, filename = '../figures/pdf/bottle_shape_barplot.pdf',
       width = 7.5, height = 5.5)
ggsave(plot = shape_p, filename = '../figures/png/bottle_shape_barplot.png',
       width = 7.5, height = 5.5)

Very clear pattern where the long neck ones win over time, and the short neck ones clearly die out. The reduced bottle_shape_red variable may be a better indicator here:

soju |> 
  filter(!is.na(bottle_shape_red)) |> 
  adorn_percentages(bottle_shape_red)
## # A tibble: 2 × 3
##   bottle_shape_red     n p    
##   <chr>            <int> <chr>
## 1 long neck          621 82%  
## 2 short neck         133 18%

Overall more long necked ones. Let’s look at this over time:

soju |> 
  filter(!is.na(bottle_shape_red)) |> 
  count(year_binned, bottle_shape_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         percentage = round(p, 2) * 100,
         percentage = str_c(percentage, '%'))
## # A tibble: 14 × 5
## # Groups:   year_binned [7]
##    year_binned bottle_shape_red     n      p percentage
##          <dbl> <chr>            <int>  <dbl> <chr>     
##  1        1960 long neck           11 0.55   55%       
##  2        1960 short neck           9 0.45   45%       
##  3        1970 long neck            5 0.263  26%       
##  4        1970 short neck          14 0.737  74%       
##  5        1980 long neck            3 0.25   25%       
##  6        1980 short neck           9 0.75   75%       
##  7        1990 long neck           84 0.545  55%       
##  8        1990 short neck          70 0.455  45%       
##  9        2000 long neck          128 0.970  97%       
## 10        2000 short neck           4 0.0303 3%        
## 11        2010 long neck          317 0.972  97%       
## 12        2010 short neck           9 0.0276 3%        
## 13        2020 long neck           73 0.802  80%       
## 14        2020 short neck          18 0.198  20%

Make a model of this, modeling long over short necks:

# Factor-code with 'short neck' as the reference level, so the
# bernoulli model below estimates p(bottle_shape_red = 'long neck'):

soju <- mutate(soju,
               bottle_shape_red = factor(bottle_shape_red,
                                         levels = c('short neck', 'long neck')))

# Generalized additive logistic regression model (with time splines),
# on all ads with a coded (non-NA) reduced bottle shape:

shape_mdl <- brm(bf(bottle_shape_red ~ 1 +
                      s(year) +
                      (1|company)),
                 data = filter(soju, !is.na(bottle_shape_red)),
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model so the slow fit can be reloaded without re-sampling:

save(shape_mdl, file = '../models/shape_mdl.RData')

Load:

load('../models/shape_mdl.RData')

Show posterior predictive simulations:

pp_check(shape_mdl, ndraws = 100)

pp_check(shape_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

shape_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: bottle_shape_red ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, !is.na(bottle_shape_red)) (Number of observations: 754) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)    15.44      6.75     6.77    32.73 1.00     2470     4487
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 26) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     3.32      0.98     1.86     5.74 1.00     2152     4041
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     2.11      0.87     0.30     3.72 1.00     1777     2892
## syear_1     -51.53     17.61   -90.54   -20.84 1.00     3279     3579
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

shape_mdl_df <- conditional_effects(shape_mdl)$year

Make a plot of the curve:

# Plot core: posterior mean curve with 95% credible ribbon.
# (Note: this reuses the name shape_p, overwriting the barplot above,
# which has already been saved to disk at that point.)

shape_p <- shape_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # linewidth replaces size for lines (size deprecated, ggplot2 >= 3.4.0):
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

shape_p <- shape_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of long neck')

# Show and save:

shape_p

ggsave('../figures/pdf/shape.pdf', shape_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/shape.png', shape_p,
       width = 5.8, height = 3.7)

6.7 Bottle size

Let’s check bottle_size:

soju |> 
  count(bottle_size, sort = TRUE)
## # A tibble: 10 × 2
##    bottle_size                n
##    <chr>                  <int>
##  1 Lifesize                 646
##  2 Large                    109
##  3 <NA>                      22
##  4 Both                       9
##  5 Lifesize and large         4
##  6 Large/Lifesize             2
##  7 Lifesize/Large             2
##  8 Actual                     1
##  9 Lifesize & Large           1
## 10 Lifesize and miniature     1

Check the NA cases again in relation to bottle_presence:

soju |> 
  count(bottle_presence, bottle_size)
## # A tibble: 17 × 3
##    bottle_presence bottle_size                n
##    <chr>           <chr>                  <int>
##  1 Drawing         Large                      1
##  2 Drawing         Lifesize                   5
##  3 No              <NA>                      16
##  4 Superimposed    Both                       9
##  5 Superimposed    Large                     61
##  6 Superimposed    Lifesize                 377
##  7 Superimposed    Lifesize and large         2
##  8 Superimposed    Lifesize and miniature     1
##  9 Superimposed    <NA>                       1
## 10 Yes             Actual                     1
## 11 Yes             Large                     47
## 12 Yes             Large/Lifesize             2
## 13 Yes             Lifesize                 264
## 14 Yes             Lifesize & Large           1
## 15 Yes             Lifesize and large         2
## 16 Yes             Lifesize/Large             2
## 17 Yes             <NA>                       5

Ok, what are these?

soju |> 
  filter(bottle_presence != 'No',
         is.na(bottle_size)) |> 
  select(id, year, company, brand, bottle_presence, bottle_size)
## # A tibble: 6 × 6
##   id     year company     brand                      bottle_presence bottle_size
##   <chr> <dbl> <chr>       <chr>                      <chr>           <chr>      
## 1 45     1978 DaeSun 대선 Daesun 대선                Superimposed    <NA>       
## 2 61     1970 JinRo 진로  Jinro 진로                 Yes             <NA>       
## 3 65     1974 JinRo 진로  Jinro 진로                 Yes             <NA>       
## 4 82     1989 JinRo 진로  Jinro 진로                 Yes             <NA>       
## 5 184    1990 JinRo 진로  Jinro 진로                 Yes             <NA>       
## 6 201    1996 JinRo 진로  Barrel Aged Premium Soju … Yes             <NA>

Action point 23: Check these cases, where the bottle is indicated to be present on bottle_presence but doesn’t have a size.

6.8 Bottle continuous data analysis

Yet to be done…

soju |> 
  ggplot(aes(x = year, y = neck_ratio)) +
  geom_point()
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).

soju |> 
  ggplot(aes(x = year, y = body_ratio)) +
  geom_point()
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_point()`).

soju |> 
  ggplot(aes(x = year, y = neck_over_body_ratio)) +
  geom_point()
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

These are all super similar. Let’s plot them against alcohol_content instead of time.

soju |> 
  ggplot(aes(x = alcohol_content, y = neck_ratio)) +
  geom_point()
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

# This was a verbatim duplicate of the neck_ratio plot above (same code,
# same 38-row warning) — almost certainly a copy-paste slip; body_ratio
# is presumably what was intended here — TODO confirm:

soju |> 
  ggplot(aes(x = alcohol_content, y = body_ratio)) +
  geom_point()
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

6.9 Logo location

Show logo location counts:

logo_location_counts <- soju |> 
  adorn_percentages(logo_location) |> 
  print()
## # A tibble: 11 × 3
##    logo_location      n p    
##    <chr>          <int> <chr>
##  1 no logo          259 32%  
##  2 top left         255 32%  
##  3 top right        102 13%  
##  4 bottom right      75 9%   
##  5 bottom left       32 4%   
##  6 bottom center     20 3%   
##  7 multiple logos    19 2%   
##  8 top center        13 2%   
##  9 center            10 1%   
## 10 center right       6 1%   
## 11 on bottle          6 1%

What’s the average year for each? This way we can see whether there’s a temporal trend for logo location.

# Mean year per logo location, joined to the counts table. The join key
# is spelled out explicitly (dplyr >= 1.1 join_by), which also silences
# the 'Joining with ...' message:

soju |> 
  group_by(logo_location) |> 
  summarize(year = mean(year, na.rm = TRUE)) |> 
  right_join(logo_location_counts, by = join_by(logo_location)) |> 
  arrange(desc(n))
## Joining with `by = join_by(logo_location)`
## # A tibble: 11 × 4
##    logo_location   year     n p    
##    <chr>          <dbl> <int> <chr>
##  1 no logo        2009.   259 32%  
##  2 top left       2011.   255 32%  
##  3 top right      2007.   102 13%  
##  4 bottom right   1997.    75 9%   
##  5 bottom left    1999.    32 4%   
##  6 bottom center  1979.    20 3%   
##  7 multiple logos 2003.    19 2%   
##  8 top center     1996.    13 2%   
##  9 center         1970.    10 1%   
## 10 center right   1983      6 1%   
## 11 on bottle      2006.     6 1%

Check that this works:

soju |> 
  filter(!is.na(logo_vertical)) |> 
  adorn_percentages(logo_vertical)
## # A tibble: 3 × 3
##   logo_vertical     n p    
##   <chr>         <int> <chr>
## 1 top             372 72%  
## 2 bottom          128 25%  
## 3 center           17 3%
soju |> 
  filter(!is.na(logo_horizontal)) |> 
  adorn_percentages(logo_horizontal)
## # A tibble: 3 × 3
##   logo_horizontal     n p    
##   <chr>           <int> <chr>
## 1 left              287 54%  
## 2 right             197 37%  
## 3 center             43 8%

Compute average year for these to get a first-hand feel for temporal shifts:

# Mean year per vertical logo position; na.rm = TRUE added for safety,
# consistent with the logo_location summary above:

soju |> 
  group_by(logo_vertical) |> 
  summarize(M = mean(year, na.rm = TRUE))
## # A tibble: 4 × 2
##   logo_vertical     M
##   <chr>         <dbl>
## 1 bottom        1995.
## 2 center        1975.
## 3 top           2009.
## 4 <NA>          2009.
# Mean year per horizontal logo position; na.rm = TRUE added for safety,
# consistent with the logo_location summary above:

soju |> 
  group_by(logo_horizontal) |> 
  summarize(M = mean(year, na.rm = TRUE))
## # A tibble: 4 × 2
##   logo_horizontal     M
##   <chr>           <dbl>
## 1 center          1982.
## 2 left            2010.
## 3 right           2002.
## 4 <NA>            2009.

Check for temporal trends then. First for logo_vertical:

# Line plot: proportion of each vertical logo position per time bin

soju |> 
  count(year_binned, logo_vertical) |>
  filter(!is.na(logo_vertical)) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ungroup() |> 
  ggplot(aes(x = year_binned, y = prop,
             group = logo_vertical, color = logo_vertical)) +
  geom_line() +
  labs(x = NULL, y = 'Proportion') +
  scale_y_continuous(limits = c(0, 1))

Quite a fluctuation, but the logo seems to have moved from more bottom position to more top position.

The goal was to model this using a multinomial model, but the problem is that the center position is not attested beyond 1990, which messes up our ability to make any inferences for that (it’s also quite low in number anyway). So we’ll resort to modeling just the difference between top and bottom position.

# Logistic model of vertical logo position over time. Note that
# filter(logo_vertical != 'center') also silently drops NA rows
# (filter removes rows where the condition is NA), so ads without a
# logo are excluded here too. With default alphabetical factor levels
# ('bottom' < 'top'), the model presumably estimates
# p(logo_vertical = 'top') — confirm against conditional_effects():

logo_vertical_mdl <- brm(bf(logo_vertical ~ 1 +
                              s(year) +
                              (1|company)),
                         data = filter(soju, logo_vertical != 'center') |> 
                           mutate(logo_vertical = factor(logo_vertical)),
                         family = bernoulli,
                         
                         # MCMC settings:
                   
                         cores = 4, seed = 42,
                         control = list(adapt_delta = 0.96),
                         chains = 4, iter = 4000, warmup = 3000)

# Save model:

save(logo_vertical_mdl, file = '../models/logo_vertical_mdl.RData')

Load:

load('../models/logo_vertical_mdl.RData')

Show conditional effects of logo_vertical_mdl:

conditional_effects(logo_vertical_mdl)

Let’s do the same for logo_horizontal:

soju |> 
  count(year_binned, logo_horizontal) |>
  filter(!is.na(logo_horizontal)) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ungroup() |> 
  ggplot(aes(x = year_binned, y = prop, group = logo_horizontal)) +
  geom_line(aes(color = logo_horizontal)) +
  xlab(NULL) +
  ylab('Proportion') +
  scale_y_continuous(limits = c(0, 1))

Definitely a clear trend where the logo has moved from center position towards a more left position in the majority of cases.

Let’s model this using a multinomial model:

# In brms, family = multinomial() is for aggregated counts and requires
# a matrix response with trials(); for a single unordered categorical
# response per row the correct family is categorical(). This is the
# likely reason the model "does not even start" (see action point
# below). NA rows (ads without a logo) are now also dropped explicitly
# rather than passed to the sampler:

logo_horizontal_mdl <- brm(bf(logo_horizontal ~ 1 +
                                s(year) +
                                (1|company)),
                           data = soju |> 
                             filter(!is.na(logo_horizontal)) |> 
                             mutate(logo_horizontal = factor(logo_horizontal)),
                           family = categorical,
                           
                           # MCMC settings:
                   
                           cores = 4, seed = 42,
                           control = list(adapt_delta = 0.96),
                           chains = 4, iter = 4000, warmup = 3000)

# Save model:

save(logo_horizontal_mdl, file = '../models/logo_horizontal_mdl.RData')

Load:

# load('../models/logo_horizontal_mdl.RData')

Bodo action point: Model does not even start — is this because of the regression splines, and that for each time point something needs to be attested?

6.10 Logo modality and type

Check logo_modality:

soju |> 
  filter(!is.na(logo_modality)) |> 
  adorn_percentages(logo_modality)
## # A tibble: 3 × 3
##   logo_modality      n p    
##   <chr>          <int> <chr>
## 1 Word and image   370 69%  
## 2 Word             151 28%  
## 3 Image             17 3%

Check multimodal_logo_type:

# Distribution of multimodal logo types (Separate vs Merged); NAs dropped:

soju |>
  filter(!is.na(multimodal_logo_type)) |> 
  adorn_percentages(multimodal_logo_type)
## # A tibble: 2 × 3
##   multimodal_logo_type     n p    
##   <chr>                <int> <chr>
## 1 Separate               278 78%  
## 2 Merged                  79 22%

6.11 Font color

Next, let’s look at the writing_color variable, but only for those ads that are not completely black and white. We need to exclude the NAs, which are the two ads for which there is no writing (id == 535 and id == 539).

# Full table of writing colors for color ads only; NAs (the two ads
# without any writing) are excluded up front:

color_ads <- filter(soju,
                    overall_color == 'Color',
                    !is.na(writing_color))

color_ads |>
  adorn_percentages(writing_color) |>
  print(n = Inf)
## # A tibble: 124 × 3
##     writing_color                     n p    
##     <chr>                         <int> <chr>
##   1 White                           109 15%  
##   2 Black                            51 7%   
##   3 Blue                             46 7%   
##   4 Green and black                  43 6%   
##   5 Blue and white                   42 6%   
##   6 Green                            40 6%   
##   7 Black and white                  27 4%   
##   8 Green, black and white           20 3%   
##   9 Green and white                  18 3%   
##  10 Yellow and white                 18 3%   
##  11 Blue and black                   17 2%   
##  12 Brown and black                  11 2%   
##  13 Pink and blue                    11 2%   
##  14 Blue and green                    9 1%   
##  15 Pink and white                    9 1%   
##  16 Black and blue                    8 1%   
##  17 Pink                              8 1%   
##  18 Red and blue                      8 1%   
##  19 Red                               7 1%   
##  20 White and green                   7 1%   
##  21 Black, red and blue               6 1%   
##  22 Blue, green and white             6 1%   
##  23 Red, blue and white               6 1%   
##  24 Blue and grey                     5 1%   
##  25 Brown                             5 1%   
##  26 Green and red                     5 1%   
##  27 White and yellow                  5 1%   
##  28 Black and green                   4 1%   
##  29 Brown and white                   4 1%   
##  30 Green, black and blue             4 1%   
##  31 Green, black and red              4 1%   
##  32 Multicolour                       4 1%   
##  33 Red and green                     4 1%   
##  34 Red, blue and green               4 1%   
##  35 Black and red                     3 0%   
##  36 Blue, yellow and white            3 0%   
##  37 Green and blue                    3 0%   
##  38 Green and pink                    3 0%   
##  39 White, black and green            3 0%   
##  40 White, blue and black             3 0%   
##  41 Yellow, blue and white            3 0%   
##  42 Yellow, green and white           3 0%   
##  43 Black and pink                    2 0%   
##  44 Black, blue and green             2 0%   
##  45 Blue, black and white             2 0%   
##  46 Blue, green and black             2 0%   
##  47 Gold and silver                   2 0%   
##  48 Gold and white                    2 0%   
##  49 Gold, silver and white            2 0%   
##  50 Green, black and orange           2 0%   
##  51 Green, black and yellow           2 0%   
##  52 Green, black, red and yellow      2 0%   
##  53 Grey                              2 0%   
##  54 Multicoloured                     2 0%   
##  55 Navy blue and baby blue           2 0%   
##  56 Orange, green and black           2 0%   
##  57 Pink, blue and black              2 0%   
##  58 Pink, yellow, blue and white      2 0%   
##  59 Purple and white                  2 0%   
##  60 Red, blue and black               2 0%   
##  61 Red, yellow, blue and white       2 0%   
##  62 Silver                            2 0%   
##  63 White and blue                    2 0%   
##  64 Yellow                            2 0%   
##  65 Yellow and black                  2 0%   
##  66 Yellow, black and white           2 0%   
##  67 Aqua and white                    1 0%   
##  68 Black and purple                  1 0%   
##  69 Black, blue, red                  1 0%   
##  70 Black, red, blue and yellow       1 0%   
##  71 Black, white, red and blue        1 0%   
##  72 Blue and purple                   1 0%   
##  73 Blue and red                      1 0%   
##  74 Blue and yellow                   1 0%   
##  75 Blue, black and grey              1 0%   
##  76 Blue, black and pink              1 0%   
##  77 Blue, grey and white              1 0%   
##  78 Blue, red and white               1 0%   
##  79 Brown and blue                    1 0%   
##  80 Gold and pink                     1 0%   
##  81 Gold, silver and green            1 0%   
##  82 Green and orange                  1 0%   
##  83 Green, black and grey             1 0%   
##  84 Green, black, blue and yellow     1 0%   
##  85 Green, blue and white             1 0%   
##  86 Green, orange black and blue      1 0%   
##  87 Green, yellow and white           1 0%   
##  88 Grey and red                      1 0%   
##  89 Navy and red                      1 0%   
##  90 Orange                            1 0%   
##  91 Orange and blue                   1 0%   
##  92 Orange and brown                  1 0%   
##  93 Orange and white                  1 0%   
##  94 Orange, blue and green            1 0%   
##  95 Orange, blue, green and black     1 0%   
##  96 Orange, green and white           1 0%   
##  97 Pink and black                    1 0%   
##  98 Pink and grey                     1 0%   
##  99 Pink, blue and white              1 0%   
## 100 Pink, blue, green and white       1 0%   
## 101 Pink, green and white             1 0%   
## 102 Pink, greey, blue and white       1 0%   
## 103 Pink, yellow and white            1 0%   
## 104 Purple and blue                   1 0%   
## 105 Purple and pink                   1 0%   
## 106 Red and black                     1 0%   
## 107 Red and pink                      1 0%   
## 108 Red and white                     1 0%   
## 109 Red, black and white              1 0%   
## 110 Red, green and black              1 0%   
## 111 Red, green and white              1 0%   
## 112 Teal and white                    1 0%   
## 113 White and black                   1 0%   
## 114 White and grey                    1 0%   
## 115 White and red                     1 0%   
## 116 White, black and blue             1 0%   
## 117 White, blue and green             1 0%   
## 118 White, blue and pink              1 0%   
## 119 White, blue and yellow            1 0%   
## 120 White, green, gold                1 0%   
## 121 White, red and blue               1 0%   
## 122 Yellow and green                  1 0%   
## 123 Yellow and red                    1 0%   
## 124 Yellow, blue, black and white     1 0%

What’s with the NAs here? What are they?

# Inspect the rows with missing writing_color (ads without any writing):

filter(soju, is.na(writing_color)) |>
  select(id, company, brand, writing_color)
## # A tibble: 2 × 4
##   id    company    brand           writing_color
##   <chr> <chr>      <chr>           <chr>        
## 1 535   BoHae 보해 Yipsejoo 잎새주 <NA>         
## 2 539   BoHae 보해 Yipsejoo 잎새주 <NA>

Let’s look at the first 6, which is all the way up to green, across years. We also want to have control over the order of factor levels so that we can display things in a sensible order. So we’ll first create a vector with the desired order, and then use that to convert the character vector writing_color into a factor, but only for the plot (not saved in the tibble).

# Define vector with pre-specified order of levels. This vector is also
# reused in the filter below, so the filtered set and the factor-level
# order cannot drift apart (previously the six values were duplicated
# by hand in the filter):

color_levels <- c('Black', 'White', 'Blue', 'Green',
                  'Green and black', 'Blue and white')

# Plot with these levels (fill colors map 1:1 onto color_levels):

soju |> 
  filter(overall_color == 'Color',
         writing_color %in% color_levels) |> 
  mutate(writing_color = factor(writing_color, levels = color_levels)) |> 
  count(year_binned, writing_color) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = writing_color)) +
  geom_col(col = 'black') +
  scale_fill_manual(values = c('black', 'white', 'blue',
                               'green', 'darkgreen', 'lightblue'),
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL)

It looks like there’s more blue in later years. There is also more white after 2005 than before, and certainly less black. More light blue for sure, but green seems fairly stable in comparison. The only green fonts seem to be a post-1995 invention though.

Another way of looking at this is to use the any_green variable created above, which also captures the fonts that have some mix with green, and the same for any_blue.

# Check:

soju |>
  count(any_green)
## # A tibble: 3 × 2
##   any_green     n
##   <chr>     <int>
## 1 has green   209
## 2 no green    586
## 3 <NA>          2
soju |>
  count(any_blue)
## # A tibble: 3 × 2
##   any_blue     n
##   <chr>    <int>
## 1 has blue   229
## 2 no blue    566
## 3 <NA>         2
# Sanity check a few instances. NOTE(review): sample_n() draws randomly
# and no seed is set in this chunk, so the rows shown differ between runs:

soju |> filter(any_green == 'has green') |> 
  sample_n(10) |> select(writing_color)
## # A tibble: 10 × 1
##    writing_color          
##    <chr>                  
##  1 Green and red          
##  2 Green                  
##  3 Blue and green         
##  4 Green, black and white 
##  5 Green                  
##  6 Green                  
##  7 Green, black and yellow
##  8 Green and white        
##  9 Green, black and white 
## 10 Green, black and blue
soju |> filter(any_blue == 'has blue') |> 
  sample_n(10) |> select(writing_color)
## # A tibble: 10 × 1
##    writing_color                
##    <chr>                        
##  1 Blue                         
##  2 Orange, blue, green and black
##  3 Blue, black and white        
##  4 Green, black and blue        
##  5 Red, blue and white          
##  6 Blue and grey                
##  7 White, blue and pink         
##  8 Pink and blue                
##  9 White, blue and yellow       
## 10 Red, blue and green

Ok, seems to have worked out. Let’s plot this now over time:

# Stacked proportions of green-containing fonts per time bin
# (color ads only; NAs dropped):

soju |>
  filter(overall_color == 'Color',
         !is.na(any_green)) |>
  count(year_binned, any_green) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year_binned, y = prop, fill = any_green)) +
  geom_col(col = 'black') +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = c('darkgreen', 'darkgrey'),
                    name = NULL) +
  xlab(NULL) +
  ylab('Proportion')

Green onset in the 80’s, but then it got a bit trite?

Let’s look at any_blue in comparison:

# Same stacked-bar view for blue-containing fonts (color ads only):

soju |>
  filter(overall_color == 'Color',
         !is.na(any_blue)) |>
  count(year_binned, any_blue) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year_binned, y = prop, fill = any_blue)) +
  geom_col(col = 'black') +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = c('blue', 'darkgrey'),
                    name = NULL) +
  xlab(NULL) +
  ylab('Proportion')

Oh wow, that’s a bit unexpected IMO. More blue for the pre-1985 ones? Obviously these are very few in number because there’s less color ads for those years, so to be taken with a grain of salt, but let’s check these cases perhaps?

# Which early (1985 or before) color ads already use blue fonts?

filter(soju,
       year <= 1985,
       overall_color == 'Color',
       any_blue == 'has blue') |>
  select(id, year, company, brand)
## # A tibble: 5 × 4
##   id     year company      brand       
##   <chr> <dbl> <chr>        <chr>       
## 1 54     1976 BaekHwa 백화 Baekhwa 백화
## 2 62     1970 JinRo 진로   Jinro 진로  
## 3 63     1971 JinRo 진로   Jinro 진로  
## 4 67     1975 JinRo 진로   Jinro 진로  
## 5 73     1985 BoHae 보해   Bohae 보해

Check those cases perhaps? What is blue used for here? And, so, yeah, there’s actually an overall trend of more blue since 1995, since those are much bigger numbers.

Build a model of p(y = has green font), which will be a logistic regression model:

# Factor-code the any_green variable ('no green' = reference level, so
# the model predicts p(has green)):

soju <- mutate(soju,
               any_green = factor(any_green, levels = c('no green', 'has green')))

# Generalized additive logistic regression model (with time splines);
# fitted on color ads only, with a by-company random intercept:

font_green_mdl <- brm(bf(any_green ~ 1 +
                        s(year) +
                        (1|company)),
                 data = filter(soju, overall_color == 'Color'),
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(font_green_mdl, file = '../models/font_green_mdl.RData')

Load:

load('../models/font_green_mdl.RData')

Show posterior predictive simulations:

pp_check(font_green_mdl, ndraws = 100)

pp_check(font_green_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

font_green_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: any_green ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, overall_color == "Color") (Number of observations: 705) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     5.24      2.23     2.41    10.89 1.00     4399     4972
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 14) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.07      0.35     0.56     1.91 1.00     2553     4021
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -1.02      0.37    -1.77    -0.28 1.00     1841     3216
## syear_1       1.64     11.92   -19.54    28.40 1.00     3601     3641
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

font_green_mdl_df <- conditional_effects(font_green_mdl)$year

Make a plot of the curve:

# Build the spline curve plot (ribbon = 95% interval, line = posterior
# mean) in a single pipeline instead of two assignment steps:

font_green_p <- font_green_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25) +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of green font')

# Show and save:

font_green_p

ggsave('../figures/pdf/font_green.pdf', font_green_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/font_green.png', font_green_p,
       width = 5.8, height = 3.7)

Build a model of p(y = has blue font), which will be a logistic regression model:

# Factor-code the hanja variable:

soju <- mutate(soju,
               any_blue = factor(any_blue, levels = c('no blue', 'has blue')))

# Generalized additive logistic regression model (with time splines):

font_blue_mdl <- brm(bf(any_blue ~ 1 +
                        s(year) +
                        (1|company)),
                 data = filter(soju, overall_color == 'Color'),
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(font_blue_mdl, file = '../models/font_blue_mdl.RData')

Load:

load('../models/font_blue_mdl.RData')

Show posterior predictive simulations:

pp_check(font_blue_mdl, ndraws = 100)

pp_check(font_blue_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

font_blue_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: any_blue ~ 1 + s(year) + (1 | company) 
##    Data: filter(soju, overall_color == "Color") (Number of observations: 705) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     6.53      2.75     2.98    13.73 1.00     4781     4525
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 14) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.43      0.48     0.76     2.61 1.00     2109     4513
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.73      0.46    -1.63     0.21 1.00     2320     3567
## syear_1     -12.29     14.80   -45.39    12.31 1.00     3564     3334
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

font_blue_mdl_df <- conditional_effects(font_blue_mdl)$year

Make a plot of the curve:

# Spline curve for p(blue font), built in a single pipeline
# (ribbon = 95% interval, line = posterior mean):

font_blue_p <- font_blue_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25) +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of blue font')

# Show and save:

font_blue_p

ggsave('../figures/pdf/font_blue.pdf', font_blue_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/font_blue.png', font_blue_p,
       width = 5.8, height = 3.7)

Put both together into a double plot:

# Change titles:

font_green_p <- font_green_p + ggtitle('a) Green font')
font_blue_p <- font_blue_p + ggtitle('b) Blue font')

# Change y-axes:

font_green_p <- font_green_p + ylab('Probability') +
  theme(plot.title = element_text(face = 'bold'))
font_blue_p <- font_blue_p + ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))

# Merge (patchwork):

font_colors_p <- font_green_p + font_blue_p

# Show and save:

font_colors_p

# Fixed path: was '../figures/pdfboth_...' (missing directory separator),
# which would write outside the pdf/ folder used by every other figure.
ggsave(plot = font_colors_p, filename = '../figures/pdf/both_font_colors_green.pdf',
       width = 9.5, height = 4.5)

6.12 Font style

Let’s look at the text types:

# Distribution of the four font style categories; NAs dropped:

soju |> 
  filter(!is.na(font_style)) |> 
  adorn_percentages(font_style)
## # A tibble: 4 × 3
##   font_style                n p    
##   <chr>                 <int> <chr>
## 1 Print                   610 77%  
## 2 Calligraphy              81 10%  
## 3 Print and calligraphy    65 8%   
## 4 Calligraphy and print    39 5%

Reduced format:

# Reduced two-way split (Print vs Calligraphy).
# NOTE(review): the NA filter is on font_style while the tabulated variable
# is font_style_red — fine if font_style_red is derived from font_style and
# NA on exactly the same rows, but worth confirming upstream.

soju |> 
  filter(!is.na(font_style)) |> 
  adorn_percentages(font_style_red)
## # A tibble: 2 × 3
##   font_style_red     n p    
##   <chr>          <int> <chr>
## 1 Print            675 85%  
## 2 Calligraphy      120 15%

Check this over the years — decided to go with the un-simplified variable for now just to check that there’s something going on.

# Font style composition per time bin (stacked bars):

soju |>
  filter(!is.na(font_style)) |> # to be replaced or kept after NAs are checked
  count(year_binned, font_style) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year_binned, y = prop, fill = font_style)) +
  geom_col(col = 'black') +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab(NULL) +
  ylab('Proportion')

Print dominates throughout, but there’s a clear downwards trend for the Print and calligraphy type, and a bit of a resurgence of Calligraphy and Calligraphy and print for the last two to three decades. I wonder whether this pattern is a similar “return to oldschool” that we’ve seen with the hanja. Let’s see how these variables behave with respect to each other:

# Cross-tabulate hanja presence with font style; proportions are
# computed within each hanja_red level (output stays grouped):

soju |>
  filter(!is.na(font_style),
         !is.na(hanja_red)) |>
  count(hanja_red, font_style) |>
  group_by(hanja_red) |>
  mutate(prop = n / sum(n))
## # A tibble: 8 × 4
## # Groups:   hanja_red [2]
##   hanja_red font_style                n   prop
##   <chr>     <chr>                 <int>  <dbl>
## 1 no        Calligraphy              70 0.128 
## 2 no        Calligraphy and print    33 0.0601
## 3 no        Print                   420 0.765 
## 4 no        Print and calligraphy    26 0.0474
## 5 yes       Calligraphy              11 0.0447
## 6 yes       Calligraphy and print     6 0.0244
## 7 yes       Print                   190 0.772 
## 8 yes       Print and calligraphy    39 0.159

So, there is a bit of association between those that have hanja and Print and calligraphy, which is proportionally much more frequent for this (15%) than for the no-Hanja cases (5%). The rest is not that different.

Maybe it does make sense to keep Print and calligraphy and Calligraphy and print separate then, but I’m just not sure that this distinction could be captured with high inter-rater reliability?

Let’s look at font weight.

# Bold vs Light split overall; NAs dropped:

soju |> 
  filter(!is.na(font_weight)) |> 
  adorn_percentages(font_weight)
## # A tibble: 2 × 3
##   font_weight     n p    
##   <chr>       <int> <chr>
## 1 Bold          417 52%  
## 2 Light         378 48%

Check this over time:

# Bold vs Light composition per time bin (stacked bars):

soju |>
  filter(!is.na(font_weight)) |> # to be replaced or kept after NAs are checked
  count(year_binned, font_weight) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year_binned, y = prop, fill = font_weight)) +
  geom_col(col = 'black') +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = 'Spectral',
                    name = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab(NULL) +
  ylab('Proportion')

Interesting. Quite a jump in Bold for the most modern ones, after what has otherwise been a slight trend towards more Light font. Could this be the same pattern of a resurgence of “bold” oldschoolness?

I worry now that some of this stuff may be an artifact of the binning procedure. Perhaps it would make sense to look at this continuously, and we may want to take more fine-grained bins later then… but the issue with that would be that for some of the sparser years, results could then be increasingly deceiving (and things will look more jittery) because the proportion could be 100% for a given small dataset, but only due to a few cases, or even just a single one.

Anyway, let’s try the continuous year measure and the bold/light contrast:

# Per-year (unbinned) proportions of bold vs. light font as a line plot:

soju |>
  filter(!is.na(font_weight)) |>
  count(year, font_weight) |>
  group_by(year) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year, y = prop, color = font_weight)) +
  geom_line() +
  scale_x_continuous(limits = c(1960, 2020),
                     breaks = seq(1960, 2020, 5)) +
  scale_color_manual(values = c('grey', 'blue'),
                     name = NULL) +
  ylab('Proportion') +
  xlab(NULL)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

A bit of a weird plot — we could show just one line, since the two mutually exclusive binary categories are each other’s complements — but it shows that the trend towards more bold in the last few years is definitely a thing. And there’s a clear pattern of ads from 1995 to 2005 having more light font.

Make a model for font weight, p(y = has bold font), which will be a logistic regression model:

# Factor-code the hanja variable:

soju <- mutate(soju,
               font_weight = factor(font_weight, levels = c('Light', 'Bold')))

# Generalized additive logistic regression model (with time splines):

weight_mdl <- brm(bf(font_weight ~ 1 +
                       s(year) +
                       (1|company)),
                 data = soju,
                 family = bernoulli,
                 
                 # MCMC settings:
                   
                 cores = 4, seed = 42,
                 chains = 4, iter = 6000, warmup = 4000,
                 control = list(adapt_delta = 0.99,
                                max_treedepth = 12))

# Save model:

save(weight_mdl, file = '../models/weight_mdl.RData')

Load:

load('../models/weight_mdl.RData')

Show posterior predictive simulations:

pp_check(weight_mdl, ndraws = 100)

pp_check(weight_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

weight_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: font_weight ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 795) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     4.84      2.20     2.03    10.46 1.00     3228     4641
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.60      0.25     0.20     1.19 1.00     1888     1980
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.03      0.22    -0.53     0.35 1.00     3148     3503
## syear_1       7.56      6.91    -6.76    21.46 1.00     6215     4572
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

weight_mdl_df <- conditional_effects(weight_mdl)$year

Make a plot of the curve:

# Spline curve for p(bold font), built in a single pipeline
# (ribbon = 95% interval, line = posterior mean):

weight_p <- weight_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25) +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of boldface font')

# Show and save:

weight_p

ggsave('../figures/pdf/font_weight.pdf', weight_p,
       width = 5.8, height = 3.7)

6.13 Model features

6.13.1 Presence of model

Check whether there is a model or not:

# Share of ads with vs. without a model:

soju |> 
  adorn_percentages(has_model)
## # A tibble: 2 × 3
##   has_model     n p    
##   <chr>     <int> <chr>
## 1 yes         522 65%  
## 2 no          275 35%

65% have a model, 35% do not.

See whether this changes across age, first using descriptive statistics:

# Per-bin proportion of ads with a model, with formatted percentages
# (output stays grouped by year_binned):

soju |>
  count(year_binned, has_model) |>
  group_by(year_binned) |>
  mutate(p = n / sum(n)) |>
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 13 × 5
## # Groups:   year_binned [7]
##    year_binned has_model     n     p perc 
##          <dbl> <chr>     <int> <dbl> <chr>
##  1        1960 no           36 1     100% 
##  2        1970 no           23 0.742 74.2%
##  3        1970 yes           8 0.258 25.8%
##  4        1980 no           11 0.688 68.8%
##  5        1980 yes           5 0.312 31.2%
##  6        1990 no          104 0.667 66.7%
##  7        1990 yes          52 0.333 33.3%
##  8        2000 no           15 0.114 11.4%
##  9        2000 yes         117 0.886 88.6%
## 10        2010 no           52 0.155 15.5%
## 11        2010 yes         283 0.845 84.5%
## 12        2020 no           34 0.374 37.4%
## 13        2020 yes          57 0.626 62.6%

Look at this time trend in a stacked bar plot:

# Stacked bars of model presence per time bin:

soju |>
  count(year_binned, has_model) |>
  group_by(year_binned) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = year_binned, y = prop, fill = has_model)) +
  geom_col(col = 'black') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab('Proportion') +
  xlab(NULL)

Makes it quite clear that there are more models over time, except slightly fewer in very recent years (post-2020).

Model whether there is any model over time:

Build a model of p(y = has model), which will be a logistic regression model:

# Factor-code the hanja variable:

soju <- mutate(soju,
               has_model = factor(has_model, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):

has_model_mdl <- brm(bf(has_model ~ 1 +
                          s(year) +
                          (1|company)),
                     data = soju,
                     family = bernoulli,
                     
                     # MCMC settings:
                   
                     cores = 4, seed = 42,
                     chains = 4, iter = 6000, warmup = 4000,
                     control = list(adapt_delta = 0.99,
                                    max_treedepth = 12))

# Save model:

save(has_model_mdl, file = '../models/has_model_mdl.RData')

Bodo action point: One divergent transition we need to get rid of!

Load:

load('../models/has_model_mdl.RData')

Check posterior predictive checks:

pp_check(has_model_mdl, ndraws = 100)

pp_check(has_model_mdl, ndraws = 100, type = 'ecdf_overlay')

Looks good to me.

has_model_mdl_df <- conditional_effects(has_model_mdl)$year

Make a plot of the curve:

# Plot core (ribbon = 95% interval, line = posterior mean):

model_p <- has_model_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', size = 1.25)

# Axes and labels:

model_p <- model_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  # Fixed label: this curve is from has_model_mdl, i.e. p(has model = yes)
  # (section 6.13.1) — not the female-model analysis of section 6.13.2.
  ylab('Probability of model presence')

# Show and save:

model_p

ggsave('../figures/pdf/has_model_model.pdf', model_p,
       width = 5.8, height = 3.7)

6.13.2 Model gender: focus on females

Check the model_gender variable:

# Distribution of the reduced model-gender categories (includes 'no model'):

soju |> 
  adorn_percentages(model_gender_red)
## # A tibble: 6 × 3
##   model_gender_red     n p    
##   <chr>            <int> <chr>
## 1 female solo        372 47%  
## 2 no model           275 35%  
## 3 male solo           67 8%   
## 4 mixed group         48 6%   
## 5 female group        22 3%   
## 6 male group          13 2%

Female solo is by far the most common overall, but remember that we also have many more newer ads where these are more common.

Look at this again, but using the subset of only those that have a solo model in it:

# Restrict to solo-model ads and tabulate the gender split:

filter(soju, str_detect(model_gender_red, 'solo')) |>
  adorn_percentages(model_gender_red)
## # A tibble: 2 × 3
##   model_gender_red     n p    
##   <chr>            <int> <chr>
## 1 female solo        372 85%  
## 2 male solo           67 15%

85% of those that have a solo character in it have a female character.

Let’s look at this over time:

# Female proportion over time, solo-model ads only
# (output stays grouped by year_binned):

soju |>
  filter(str_detect(model_gender_red, 'solo')) |>
  count(year_binned, has_female) |>
  group_by(year_binned) |>
  mutate(p = n / sum(n)) |>
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 11 × 5
## # Groups:   year_binned [6]
##    year_binned has_female     n      p perc 
##          <dbl> <chr>      <int>  <dbl> <chr>
##  1        1970 no             3 0.75   75%  
##  2        1970 yes            1 0.25   25%  
##  3        1980 no             2 1      100% 
##  4        1990 no            27 0.692  69.2%
##  5        1990 yes           12 0.308  30.8%
##  6        2000 no             7 0.0642 6.4% 
##  7        2000 yes          102 0.936  93.6%
##  8        2010 no            23 0.101  10.1%
##  9        2010 yes          205 0.899  89.9%
## 10        2020 no             5 0.0877 8.8% 
## 11        2020 yes           52 0.912  91.2%

Compute average age per model gender category:

# Average (mean) year per model gender category:

summarize(group_by(soju, model_gender_red), M = mean(year))
## # A tibble: 6 × 2
##   model_gender_red     M
##   <chr>            <dbl>
## 1 female group     2017.
## 2 female solo      2013.
## 3 male group       1995.
## 4 male solo        2004.
## 5 mixed group      2009.
## 6 no model         1997.

The temporal trend in stacked bar plot, for the has_female model variable:

# Stacked bar plot of has_female over time, among ads with a model:

soju |> 
  filter(has_model == 'yes') |>  # only take those that have a model
  count(year_binned, has_female) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = p, fill = has_female)) +
  geom_col(col = 'black') +
  labs(x = NULL, y = 'Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Model of female over time:

Build a model of p(y = has female), which will be a logistic regression model:

# Factor-code the has_female variable ('no' = reference level, so the
# model predicts the probability of 'yes'):

soju <- mutate(soju,
               has_female = factor(has_female, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):
# - s(year): smooth (spline) term, so p(female) may change non-linearly
#   over time
# - (1|company): random intercepts per company
# - 2,000 post-warmup draws per chain (iter 6000, warmup 4000);
#   high adapt_delta / max_treedepth to reduce divergent transitions

female_mdl <- brm(bf(has_female ~ 1 +
                       s(year) +
                       (1|company)),
                  data = soju,
                  family = bernoulli,
                  
                  # MCMC settings:
                   
                  cores = 4, seed = 42,
                  chains = 4, iter = 6000, warmup = 4000,
                  control = list(adapt_delta = 0.99,
                                 max_treedepth = 12))

# Save model (so later runs can load it instead of re-fitting):

save(female_mdl, file = '../models/female_mdl.RData')

Bodo action point: One divergent transition we need to get rid of!

Load:

# Re-load the saved female_mdl object:

load('../models/female_mdl.RData')

Check posterior predictive checks:

# Posterior predictive check with 100 posterior draws:

pp_check(female_mdl, ndraws = 100)

# Same check, as an empirical CDF overlay:

pp_check(female_mdl, ndraws = 100, type = 'ecdf_overlay')

Looks good to me.

# Extract the conditional effect of year as a data frame for plotting:

female_mdl_df <- conditional_effects(female_mdl)$year

Make a plot of the curve:

# Plot core:
# (linewidth replaces the line 'size' aesthetic, which is deprecated
# since ggplot2 3.4.0)

female_p <- female_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

female_p <- female_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  labs(x = 'Year', y = 'Probability of female model')

# Show and save (PDF and PNG versions):

female_p

ggsave(filename = '../figures/pdf/female_model.pdf', plot = female_p,
       width = 5.8, height = 3.7)
ggsave(filename = '../figures/png/female_model.png', plot = female_p,
       width = 5.8, height = 3.7)

6.13.3 Model gender: focus on males

Look at the time trend for the reduced variables has_model, has_female, has_group, has_male, and has_mixed:

# Helper: stacked proportion bar plot of a yes/no variable by binned
# year. The three plots below were identical except for the variable,
# so the pipeline is factored out; {{ }} forwards the (unquoted)
# column name.

plot_prop_by_year <- function(df, var) {
  df |> 
    count(year_binned, {{ var }}) |> 
    group_by(year_binned) |> 
    mutate(prop = n / sum(n)) |> 
    ggplot(aes(x = year_binned, y = prop, fill = {{ var }})) +
    geom_col(col = 'black') +
    xlab(NULL) +
    ylab('Proportion') +
    scale_fill_manual(values = c('purple', 'goldenrod3')) +
    scale_y_continuous(expand = c(0, 0)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Has male:

plot_prop_by_year(soju, has_male)

# Has group:

plot_prop_by_year(soju, has_group)

# Has mixed:

plot_prop_by_year(soju, has_mixed)

Model of male over time:

Build a model of p(y = has male), which will be a logistic regression model:

# Factor-code the has_male variable ('no' = reference level, so the
# model predicts the probability of 'yes'):

soju <- mutate(soju,
               has_male = factor(has_male, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):
# - s(year): smooth (spline) term for non-linear change over time
# - (1|company): random intercepts per company
# - same MCMC settings as the female model for comparability

male_mdl <- brm(bf(has_male ~ 1 +
                     s(year) +
                     (1|company)),
                data = soju,
                family = bernoulli,
                
                # MCMC settings:
                   
                cores = 4, seed = 42,
                chains = 4, iter = 6000, warmup = 4000,
                control = list(adapt_delta = 0.99,
                               max_treedepth = 12))

# Save model (so later runs can load it instead of re-fitting):

save(male_mdl, file = '../models/male_mdl.RData')

Bodo action point: 19(!!!) divergent transitions that we need to get rid of!

Load:

# Re-load the saved male_mdl object:

load('../models/male_mdl.RData')

Check posterior predictive checks:

# Posterior predictive check with 100 posterior draws:

pp_check(male_mdl, ndraws = 100)

# Same check, as an empirical CDF overlay:

pp_check(male_mdl, ndraws = 100, type = 'ecdf_overlay')

Looks good to me.

# Extract the conditional effect of year as a data frame for plotting:

male_mdl_df <- conditional_effects(male_mdl)$year

Make a plot of the curve:

# Plot core:
# (linewidth replaces the line 'size' aesthetic, which is deprecated
# since ggplot2 3.4.0)

male_p <- male_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

male_p <- male_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  labs(x = 'Year', y = 'Probability of male model')

# Show and save:

male_p

# Save as PDF and PNG, matching how female_p is saved above:

ggsave('../figures/pdf/male_model.pdf', male_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/male_model.png', male_p,
       width = 5.8, height = 3.7)

Make a multi plot of all three (has model, has female, has male):

# Per-plot adjustments: panel title, y-axis (label kept only on the
# left-most panel), and bold title face:

model_p <- model_p +
  ggtitle('a) Presence of model') +
  ylab('Probability') +
  theme(plot.title = element_text(face = 'bold'))

female_p <- female_p +
  ggtitle('b) Female model') +
  ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))

male_p <- male_p +
  ggtitle('c) Male model') +
  ylab(NULL) +
  theme(axis.text.y = element_blank(),
        plot.title = element_text(face = 'bold'))

# Merge into one multi-panel figure (patchwork):

triple_p <- model_p + female_p + male_p

# Show and save:

triple_p

# Fixed path: was '../figures/pdfmodel_triple.pdf' (missing directory
# separator), which saved the file outside the pdf folder.

ggsave(plot = triple_p, filename = '../figures/pdf/model_triple.pdf',
       width = 15, height = 4.5)

6.13.4 Model clothing

What is the color of the clothing? We’ll look at this only in the color ads of course:

# Clothing colors among color ads that feature a model:

soju |> 
  filter(has_model == 'yes', overall_color == 'Color') |> 
  adorn_percentages(model_clothing_color) |> 
  print(n = Inf)
## # A tibble: 83 × 3
##    model_clothing_color                              n p    
##    <chr>                                         <int> <chr>
##  1 White                                           128 25%  
##  2 Blue                                             40 8%   
##  3 Blue and white                                   40 8%   
##  4 Pink                                             36 7%   
##  5 Green                                            28 6%   
##  6 Black and white                                  23 5%   
##  7 <NA>                                             19 4%   
##  8 Red                                              16 3%   
##  9 Black                                            11 2%   
## 10 Beige                                            10 2%   
## 11 Navy and white                                   10 2%   
## 12 Multicolour                                       8 2%   
## 13 Orange                                            8 2%   
## 14 Green and white                                   6 1%   
## 15 Pink and blue                                     6 1%   
## 16 Pink and white                                    6 1%   
## 17 Yellow and white                                  6 1%   
## 18 Light blue                                        5 1%   
## 19 Red and blue                                      5 1%   
## 20 Yellow                                            5 1%   
## 21 Grey                                              4 1%   
## 22 Multicoloured                                     4 1%   
## 23 Navy                                              4 1%   
## 24 Pink and khaki                                    4 1%   
## 25 Red and white                                     4 1%   
## 26 Silver                                            3 1%   
## 27 White and green                                   3 1%   
## 28 Brown                                             2 0%   
## 29 Gold and pink                                     2 0%   
## 30 Green and blue                                    2 0%   
## 31 Light brown                                       2 0%   
## 32 Orange and white                                  2 0%   
## 33 Pink, navy and white                              2 0%   
## 34 Purple and white                                  2 0%   
## 35 Red, blue and white                               2 0%   
## 36 White and blue                                    2 0%   
## 37 Yellow and pink                                   2 0%   
## 38 Aqua                                              1 0%   
## 39 Baby blue                                         1 0%   
## 40 Blue and green                                    1 0%   
## 41 Blue suit, green suit                             1 0%   
## 42 Blue, yellow and white                            1 0%   
## 43 Brown and blue                                    1 0%   
## 44 Burgundy                                          1 0%   
## 45 Cannot tell                                       1 0%   
## 46 Female = Purple, males = plain with some blue     1 0%   
## 47 Female = brown, male = light blue suits           1 0%   
## 48 Female = pink, male = dark, bland colours         1 0%   
## 49 Gold and white                                    1 0%   
## 50 Green and red                                     1 0%   
## 51 Grey  and yellow                                  1 0%   
## 52 Grey and blue                                     1 0%   
## 53 Leopard print                                     1 0%   
## 54 Ligh blue                                         1 0%   
## 55 Light green and greenish blue jeans.              1 0%   
## 56 Light yellow                                      1 0%   
## 57 Mauve                                             1 0%   
## 58 Mint and white                                    1 0%   
## 59 NA (no clothing visible)                          1 0%   
## 60 No                                                1 0%   
## 61 Orange, blue, white                               1 0%   
## 62 Pastel tones                                      1 0%   
## 63 Pink and black                                    1 0%   
## 64 Pink and purple                                   1 0%   
## 65 Pink and yellow                                   1 0%   
## 66 Pink, black and white                             1 0%   
## 67 Pink, purple and grey                             1 0%   
## 68 Purple                                            1 0%   
## 69 Purple and yellow                                 1 0%   
## 70 Purple, black and green                           1 0%   
## 71 Red (S. Korean national team kit)                 1 0%   
## 72 Salmon (orange)                                   1 0%   
## 73 Silver and black                                  1 0%   
## 74 Silver and navy                                   1 0%   
## 75 Violet                                            1 0%   
## 76 White and black                                   1 0%   
## 77 White and brown                                   1 0%   
## 78 White and pink                                    1 0%   
## 79 White and pink (hanbok)                           1 0%   
## 80 Yellow and blue                                   1 0%   
## 81 Yellow, pink and white                            1 0%   
## 82 Yellow, red, green (hanbok)                       1 0%   
## 83 black&white and blue                              1 0%

Has this changed over time? The five most frequent colors (white, blue, blue and white, pink, and green) versus the rest only:

# Keep the five most frequent clothing colors; lump the rest as 'other':

these_keep <- c('White', 'Blue', 'Blue and white', 'Pink', 'Green')

soju |> 
  filter(has_model == 'yes',
         overall_color == 'Color') |> 
  mutate(clothing_color_red = if_else(model_clothing_color %in% these_keep,
                                      model_clothing_color,
                                      'other')) |> 
  count(year_binned, clothing_color_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  print(n = Inf)
## # A tibble: 24 × 4
## # Groups:   year_binned [5]
##    year_binned clothing_color_red     n       p
##          <dbl> <chr>              <int>   <dbl>
##  1        1970 other                  2 1      
##  2        1990 Blue and white         5 0.102  
##  3        1990 Green                  3 0.0612 
##  4        1990 Pink                   1 0.0204 
##  5        1990 White                  7 0.143  
##  6        1990 other                 33 0.673  
##  7        2000 Blue                   7 0.0598 
##  8        2000 Blue and white         1 0.00855
##  9        2000 Green                  7 0.0598 
## 10        2000 Pink                   4 0.0342 
## 11        2000 White                 40 0.342  
## 12        2000 other                 58 0.496  
## 13        2010 Blue                  25 0.0883 
## 14        2010 Blue and white        29 0.102  
## 15        2010 Green                 16 0.0565 
## 16        2010 Pink                  24 0.0848 
## 17        2010 White                 61 0.216  
## 18        2010 other                128 0.452  
## 19        2020 Blue                   8 0.140  
## 20        2020 Blue and white         5 0.0877 
## 21        2020 Green                  2 0.0351 
## 22        2020 Pink                   7 0.123  
## 23        2020 White                 20 0.351  
## 24        2020 other                 15 0.263
# Same counts as a stacked bar plot. The fill colors are now a *named*
# vector: the previous unnamed vector was matched to the alphabetically
# sorted categories, which assigned 'orange' to Pink, 'pink' to White,
# and 'white' to 'other'. Named mapping ties each category to its
# matching color ('other' gets the left-over orange).

soju |> 
  filter(overall_color == 'Color',
         has_model == 'yes') |> 
  mutate(clothing_color_red = if_else(model_clothing_color %in% these_keep,
                                      model_clothing_color, 'other')) |> 
  count(year_binned, clothing_color_red) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = p, fill = clothing_color_red)) +
  geom_col(col = 'black') +
  scale_fill_manual(values = c('Blue' = 'blue',
                               'Blue and white' = 'lightblue',
                               'Green' = 'green',
                               'Pink' = 'pink',
                               'White' = 'white',
                               'other' = 'orange'),
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(legend.position = 'bottom')

Model the rise of white, and then the rise of blue:

Has this changed over time?

# Full clothing color table per binned year (models only):

clothing_counts <- soju |> 
  filter(has_model == 'yes') |> 
  count(year_binned, model_clothing_color)

clothing_counts |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  print(n = Inf)
## # A tibble: 129 × 4
## # Groups:   year_binned [6]
##     year_binned model_clothing_color                              n       p
##           <dbl> <chr>                                         <int>   <dbl>
##   1        1970 Black and white                                   6 0.75   
##   2        1970 <NA>                                              2 0.25   
##   3        1980 Black and white                                   4 0.8    
##   4        1980 <NA>                                              1 0.2    
##   5        1990 Beige                                             4 0.0769 
##   6        1990 Black and white                                   5 0.0962 
##   7        1990 Blue and white                                    5 0.0962 
##   8        1990 Cannot tell                                       1 0.0192 
##   9        1990 Green                                             3 0.0577 
##  10        1990 Green and red                                     1 0.0192 
##  11        1990 Grey                                              3 0.0577 
##  12        1990 Grey and blue                                     1 0.0192 
##  13        1990 Multicolour                                       1 0.0192 
##  14        1990 Pink                                              1 0.0192 
##  15        1990 White                                             7 0.135  
##  16        1990 White and blue                                    2 0.0385 
##  17        1990 White and brown                                   1 0.0192 
##  18        1990 Yellow                                            1 0.0192 
##  19        1990 Yellow and pink                                   1 0.0192 
##  20        1990 <NA>                                             15 0.288  
##  21        2000 Beige                                             1 0.00855
##  22        2000 Black                                             4 0.0342 
##  23        2000 Black and white                                   6 0.0513 
##  24        2000 Blue                                              7 0.0598 
##  25        2000 Blue and white                                    1 0.00855
##  26        2000 Blue, yellow and white                            1 0.00855
##  27        2000 Green                                             7 0.0598 
##  28        2000 Green and blue                                    2 0.0171 
##  29        2000 Green and white                                   3 0.0256 
##  30        2000 Grey  and yellow                                  1 0.00855
##  31        2000 Light blue                                        1 0.00855
##  32        2000 Light green and greenish blue jeans.              1 0.00855
##  33        2000 Multicoloured                                     4 0.0342 
##  34        2000 Navy and white                                    1 0.00855
##  35        2000 No                                                1 0.00855
##  36        2000 Orange                                            2 0.0171 
##  37        2000 Pink                                              4 0.0342 
##  38        2000 Pink and white                                    2 0.0171 
##  39        2000 Purple, black and green                           1 0.00855
##  40        2000 Red                                               7 0.0598 
##  41        2000 Red (S. Korean national team kit)                 1 0.00855
##  42        2000 Red and blue                                      2 0.0171 
##  43        2000 Silver                                            1 0.00855
##  44        2000 Silver and navy                                   1 0.00855
##  45        2000 Violet                                            1 0.00855
##  46        2000 White                                            40 0.342  
##  47        2000 White and black                                   1 0.00855
##  48        2000 White and pink                                    1 0.00855
##  49        2000 White and pink (hanbok)                           1 0.00855
##  50        2000 Yellow                                            1 0.00855
##  51        2000 Yellow and blue                                   1 0.00855
##  52        2000 Yellow and pink                                   1 0.00855
##  53        2000 Yellow and white                                  5 0.0427 
##  54        2000 Yellow, red, green (hanbok)                       1 0.00855
##  55        2000 <NA>                                              2 0.0171 
##  56        2010 Aqua                                              1 0.00353
##  57        2010 Baby blue                                         1 0.00353
##  58        2010 Beige                                             5 0.0177 
##  59        2010 Black                                             7 0.0247 
##  60        2010 Black and white                                  11 0.0389 
##  61        2010 Blue                                             25 0.0883 
##  62        2010 Blue and white                                   29 0.102  
##  63        2010 Blue suit, green suit                             1 0.00353
##  64        2010 Brown                                             2 0.00707
##  65        2010 Brown and blue                                    1 0.00353
##  66        2010 Burgundy                                          1 0.00353
##  67        2010 Female = Purple, males = plain with some blue     1 0.00353
##  68        2010 Female = brown, male = light blue suits           1 0.00353
##  69        2010 Female = pink, male = dark, bland colours         1 0.00353
##  70        2010 Gold and pink                                     2 0.00707
##  71        2010 Gold and white                                    1 0.00353
##  72        2010 Green                                            16 0.0565 
##  73        2010 Green and white                                   1 0.00353
##  74        2010 Grey                                              1 0.00353
##  75        2010 Leopard print                                     1 0.00353
##  76        2010 Ligh blue                                         1 0.00353
##  77        2010 Light blue                                        4 0.0141 
##  78        2010 Light brown                                       2 0.00707
##  79        2010 Light yellow                                      1 0.00353
##  80        2010 Mauve                                             1 0.00353
##  81        2010 Mint and white                                    1 0.00353
##  82        2010 Multicolour                                       6 0.0212 
##  83        2010 NA (no clothing visible)                          1 0.00353
##  84        2010 Navy                                              1 0.00353
##  85        2010 Navy and white                                    9 0.0318 
##  86        2010 Orange                                            6 0.0212 
##  87        2010 Orange and white                                  2 0.00707
##  88        2010 Orange, blue, white                               1 0.00353
##  89        2010 Pink                                             24 0.0848 
##  90        2010 Pink and black                                    1 0.00353
##  91        2010 Pink and blue                                     5 0.0177 
##  92        2010 Pink and khaki                                    4 0.0141 
##  93        2010 Pink and purple                                   1 0.00353
##  94        2010 Pink and white                                    4 0.0141 
##  95        2010 Pink and yellow                                   1 0.00353
##  96        2010 Pink, black and white                             1 0.00353
##  97        2010 Pink, navy and white                              2 0.00707
##  98        2010 Pink, purple and grey                             1 0.00353
##  99        2010 Purple                                            1 0.00353
## 100        2010 Purple and white                                  2 0.00707
## 101        2010 Purple and yellow                                 1 0.00353
## 102        2010 Red                                               8 0.0283 
## 103        2010 Red and blue                                      3 0.0106 
## 104        2010 Red and white                                     3 0.0106 
## 105        2010 Red, blue and white                               2 0.00707
## 106        2010 Salmon (orange)                                   1 0.00353
## 107        2010 Silver and black                                  1 0.00353
## 108        2010 White                                            61 0.216  
## 109        2010 White and green                                   3 0.0106 
## 110        2010 Yellow                                            3 0.0106 
## 111        2010 Yellow and white                                  1 0.00353
## 112        2010 Yellow, pink and white                            1 0.00353
## 113        2010 black&white and blue                              1 0.00353
## 114        2010 <NA>                                              2 0.00707
## 115        2020 Black and white                                   2 0.0351 
## 116        2020 Blue                                              8 0.140  
## 117        2020 Blue and green                                    1 0.0175 
## 118        2020 Blue and white                                    5 0.0877 
## 119        2020 Green                                             2 0.0351 
## 120        2020 Green and white                                   2 0.0351 
## 121        2020 Multicolour                                       1 0.0175 
## 122        2020 Navy                                              3 0.0526 
## 123        2020 Pastel tones                                      1 0.0175 
## 124        2020 Pink                                              7 0.123  
## 125        2020 Pink and blue                                     1 0.0175 
## 126        2020 Red                                               1 0.0175 
## 127        2020 Red and white                                     1 0.0175 
## 128        2020 Silver                                            2 0.0351 
## 129        2020 White                                            20 0.351

Bodo action points: Need to look at this more systematically and probably collapse categories.

6.13.5 Bare shoulders

Look at this overall:

# Bare shoulders, overall (female models only):

adorn_percentages(filter(soju, has_female == 'yes'),
                  female_bare_shoulders)
## # A tibble: 3 × 3
##   female_bare_shoulders     n p    
##   <chr>                 <int> <chr>
## 1 Yes                     221 56%  
## 2 No                      171 43%  
## 3 <NA>                      2 1%

Bodo action points: What are the NAs here?

Has this changed over time?

# Bare shoulders per binned year (female models only):

soju |> 
  filter(has_female == 'yes') |> 
  count(year_binned, female_bare_shoulders) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 11 × 5
## # Groups:   year_binned [5]
##    year_binned female_bare_shoulders     n      p perc 
##          <dbl> <chr>                 <int>  <dbl> <chr>
##  1        1970 No                        1 0.5    50%  
##  2        1970 <NA>                      1 0.5    50%  
##  3        1990 No                       10 0.833  83.3%
##  4        1990 Yes                       1 0.0833 8.3% 
##  5        1990 <NA>                      1 0.0833 8.3% 
##  6        2000 No                       43 0.422  42.2%
##  7        2000 Yes                      59 0.578  57.8%
##  8        2010 No                       92 0.407  40.7%
##  9        2010 Yes                     134 0.593  59.3%
## 10        2020 No                       25 0.481  48.1%
## 11        2020 Yes                      27 0.519  51.9%

6.13.6 Bare legs

Look at this overall:

# Bare legs, overall (female models only):

adorn_percentages(filter(soju, has_female == 'yes'),
                  female_bare_legs)
## # A tibble: 3 × 3
##   female_bare_legs     n p    
##   <chr>            <int> <chr>
## 1 No                 286 73%  
## 2 Yes                101 26%  
## 3 <NA>                 7 2%

Bodo action points: What are the NAs here?

Has this changed over time?

# Bare legs per binned year (female models only):

soju |> 
  filter(has_female == 'yes') |> 
  count(year_binned, female_bare_legs) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 12 × 5
## # Groups:   year_binned [5]
##    year_binned female_bare_legs     n      p perc 
##          <dbl> <chr>            <int>  <dbl> <chr>
##  1        1970 No                   1 0.5    50%  
##  2        1970 <NA>                 1 0.5    50%  
##  3        1990 No                  10 0.833  83.3%
##  4        1990 Yes                  1 0.0833 8.3% 
##  5        1990 <NA>                 1 0.0833 8.3% 
##  6        2000 No                  78 0.765  76.5%
##  7        2000 Yes                 19 0.186  18.6%
##  8        2000 <NA>                 5 0.0490 4.9% 
##  9        2010 No                 153 0.677  67.7%
## 10        2010 Yes                 73 0.323  32.3%
## 11        2020 No                  44 0.846  84.6%
## 12        2020 Yes                  8 0.154  15.4%

6.13.7 Bare cleavage

Look at this overall:

# Bare cleavage, overall (female models only):

adorn_percentages(filter(soju, has_female == 'yes'),
                  female_bare_cleavage)
## # A tibble: 3 × 3
##   female_bare_cleavage     n p    
##   <chr>                <int> <chr>
## 1 No                     302 77%  
## 2 Yes                     90 23%  
## 3 <NA>                     2 1%

Bodo action points: What are the NAs here?

Has this changed over time?

# Bare cleavage per binned year (female models only):

soju |> 
  filter(has_female == 'yes') |> 
  count(year_binned, female_bare_cleavage) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 10 × 5
## # Groups:   year_binned [5]
##    year_binned female_bare_cleavage     n      p perc 
##          <dbl> <chr>                <int>  <dbl> <chr>
##  1        1970 No                       1 0.5    50%  
##  2        1970 <NA>                     1 0.5    50%  
##  3        1990 No                      11 0.917  91.7%
##  4        1990 <NA>                     1 0.0833 8.3% 
##  5        2000 No                      68 0.667  66.7%
##  6        2000 Yes                     34 0.333  33.3%
##  7        2010 No                     173 0.765  76.5%
##  8        2010 Yes                     53 0.235  23.5%
##  9        2020 No                      49 0.942  94.2%
## 10        2020 Yes                      3 0.0577 5.8%

6.13.8 Bottle in hand

Look at this overall:

# Bottle held in hand, overall (models only):

adorn_percentages(filter(soju, has_model == 'yes'),
                  model_bottle_holding_hand)
## # A tibble: 3 × 3
##   model_bottle_holding_hand     n p    
##   <chr>                     <int> <chr>
## 1 No                          403 77%  
## 2 Yes                         117 22%  
## 3 <NA>                          2 0%

Bodo action points: What are the NAs here?

Has this changed over time?

# Bottle held in hand per binned year (models only):

soju |> 
  filter(has_model == 'yes') |> 
  count(year_binned, model_bottle_holding_hand) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 13 × 5
## # Groups:   year_binned [6]
##    year_binned model_bottle_holding_hand     n      p perc 
##          <dbl> <chr>                     <int>  <dbl> <chr>
##  1        1970 No                            2 0.25   25%  
##  2        1970 Yes                           6 0.75   75%  
##  3        1980 No                            2 0.4    40%  
##  4        1980 Yes                           3 0.6    60%  
##  5        1990 No                           42 0.808  80.8%
##  6        1990 Yes                           8 0.154  15.4%
##  7        1990 <NA>                          2 0.0385 3.8% 
##  8        2000 No                          102 0.872  87.2%
##  9        2000 Yes                          15 0.128  12.8%
## 10        2010 No                          211 0.746  74.6%
## 11        2010 Yes                          72 0.254  25.4%
## 12        2020 No                           44 0.772  77.2%
## 13        2020 Yes                          13 0.228  22.8%

Interesting shift. If the bottle was present in earlier years, it was held in the hands — these days there are more bottles overall, as the other analyses show, but they are also more likely to be standalone.

If it’s held in the hand, which hand is it held in?

# Which hand holds the bottle, per binned year (bottle-holders only):

soju |> 
  filter(model_bottle_holding_hand == 'Yes') |> 
  count(year_binned, if_holding_bottle_which_hand) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n)) |> 
  mutate(perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 18 × 5
## # Groups:   year_binned [7]
##    year_binned if_holding_bottle_which_hand     n     p perc 
##          <dbl> <chr>                        <int> <dbl> <chr>
##  1        1960 Right                            2 1     100% 
##  2        1970 Both                             3 0.429 42.9%
##  3        1970 Left                             1 0.143 14.3%
##  4        1970 Right                            3 0.429 42.9%
##  5        1980 Both                             1 0.333 33.3%
##  6        1980 Right                            2 0.667 66.7%
##  7        1990 Both                             1 0.125 12.5%
##  8        1990 Left                             2 0.25  25%  
##  9        1990 Right                            5 0.625 62.5%
## 10        2000 Both                             5 0.333 33.3%
## 11        2000 Left                             7 0.467 46.7%
## 12        2000 Right                            3 0.2   20%  
## 13        2010 Both                            20 0.278 27.8%
## 14        2010 Left                            20 0.278 27.8%
## 15        2010 Right                           32 0.444 44.4%
## 16        2020 Both                             4 0.25  25%  
## 17        2020 Left                             3 0.188 18.8%
## 18        2020 Right                            9 0.562 56.2%

Make a plot of this:

# Stacked proportion bars: which hand holds the bottle, by year bin.
soju |> 
  filter(model_bottle_holding_hand == 'Yes') |> 
  count(year_binned, if_holding_bottle_which_hand) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = if_holding_bottle_which_hand)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  # Character levels map alphabetically (Both, Left, Right), so:
  # Both = purple, Left = goldenrod3, Right = grey.
  scale_fill_manual(values = c('purple', 'goldenrod3', 'grey')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')

There definitely is a mild decline of holding it in both hands. The Left hand category seems a bit all over the place and I’m not sure why it would be that much more frequent in the 2000’s.

6.13.9 Glass in hand

Let’s look at the holding_glass variable:

soju |> 
  filter(!is.na(holding_glass)) |> 
  filter(!holding_glass %in% c('Yes (female)', 'Yes (male)')) |> 
  count(year_binned, holding_glass) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 13 × 5
## # Groups:   year_binned [7]
##    year_binned holding_glass     n     p perc 
##          <dbl> <chr>         <int> <dbl> <chr>
##  1        1960 No                2 1     100% 
##  2        1970 No                7 0.636 63.6%
##  3        1970 Yes               4 0.364 36.4%
##  4        1980 No                2 0.4   40%  
##  5        1980 Yes               3 0.6   60%  
##  6        1990 No               35 0.673 67.3%
##  7        1990 Yes              17 0.327 32.7%
##  8        2000 No               79 0.675 67.5%
##  9        2000 Yes              38 0.325 32.5%
## 10        2010 No              167 0.594 59.4%
## 11        2010 Yes             114 0.406 40.6%
## 12        2020 No               38 0.585 58.5%
## 13        2020 Yes              27 0.415 41.5%

Looks like a definite rise of holding a glass. Let’s plot that over time:

soju |> 
  filter(!is.na(holding_glass)) |> 
  filter(!holding_glass %in% c('Yes (female)', 'Yes (male)')) |> 
  count(year_binned, holding_glass) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = holding_glass)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')

Ok, maybe not as much. Just really absent in the 1960’s, but we’ve got few ads for that, so the percentages may drift more drastically.

6.13.10 Hands on hips

Are the hands on the hips?

soju |> 
  filter(!is.na(hand_on_hips)) |> 
  count(year_binned, hand_on_hips) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 10 × 5
## # Groups:   year_binned [7]
##    year_binned hand_on_hips     n      p perc 
##          <dbl> <chr>        <int>  <dbl> <chr>
##  1        1960 No               2 1      100% 
##  2        1970 No              11 1      100% 
##  3        1980 No               5 1      100% 
##  4        1990 No              50 1      100% 
##  5        2000 No              94 0.810  81%  
##  6        2000 Yes             22 0.190  19%  
##  7        2010 No             241 0.852  85.2%
##  8        2010 Yes             42 0.148  14.8%
##  9        2020 No              62 0.954  95.4%
## 10        2020 Yes              3 0.0462 4.6%

They never are before the 2000’s!

Plot this, with the option of plotting it for women only:

soju |> 
  # filter(has_female == 'yes') |> 
  filter(!is.na(hand_on_hips)) |> 
  count(year_binned, hand_on_hips) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = hand_on_hips)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')

6.13.11 Hourglass body shape

Let’s look at hourglass_body_shape:

soju |> 
  filter(!is.na(hourglass_body_shape)) |> 
  count(year_binned, hourglass_body_shape) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 10 × 5
## # Groups:   year_binned [7]
##    year_binned hourglass_body_shape     n      p perc 
##          <dbl> <chr>                <int>  <dbl> <chr>
##  1        1960 No                       2 1      100% 
##  2        1970 No                      11 1      100% 
##  3        1980 No                       5 1      100% 
##  4        1990 No                      50 1      100% 
##  5        2000 No                      92 0.8    80%  
##  6        2000 Yes                     23 0.2    20%  
##  7        2010 No                     234 0.827  82.7%
##  8        2010 Yes                     49 0.173  17.3%
##  9        2020 No                      61 0.938  93.8%
## 10        2020 Yes                      4 0.0615 6.2%

They never are before the 2000’s!

Plot this:

soju |> 
  filter(!is.na(hourglass_body_shape)) |> 
  count(year_binned, hourglass_body_shape) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = hourglass_body_shape)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')

6.13.12 Body or facial appearance

Let’s look at body_or_facial_appearance:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(body_or_facial_appearance)
## # A tibble: 3 × 3
##   body_or_facial_appearance     n p    
##   <chr>                     <int> <chr>
## 1 Soft                        487 93%  
## 2 Strong                       34 7%   
## 3 <NA>                          1 0%

Bodo action point: Why is there still an NA value?

Let’s make a plot of this:

soju |> 
  filter(!is.na(body_or_facial_appearance)) |> 
  count(year_binned, body_or_facial_appearance) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = body_or_facial_appearance)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')  

More “strong” facial appearances in the 70’s or 80’s ads.

6.13.13 Full body

Let’s check full_body:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(full_body)
## # A tibble: 7 × 3
##   full_body     n p    
##   <chr>     <int> <chr>
## 1 No          426 82%  
## 2 Yes          78 15%  
## 3 Chest         7 1%   
## 4 Waist         5 1%   
## 5 Thighs        3 1%   
## 6 Hips          2 0%   
## 7 Full body     1 0%

Most don’t have the full body.

Bodo action points: But then why are “waist”, “chest”, “thighs” in there… shouldn’t that be “No”, there is no full body?

So for now, let’s just use the “No” and “Yes” levels.

soju |> 
  filter(!is.na(full_body)) |> 
  filter(full_body %in% c('Yes', 'No')) |> 
  count(year_binned, full_body) |> 
  group_by(year_binned) |> 
  mutate(p = n / sum(n),
         perc = str_c(round(p, 3) * 100, '%'))
## # A tibble: 13 × 5
## # Groups:   year_binned [7]
##    year_binned full_body     n     p perc 
##          <dbl> <chr>     <int> <dbl> <chr>
##  1        1960 No            2 1     100% 
##  2        1970 No            9 0.818 81.8%
##  3        1970 Yes           2 0.182 18.2%
##  4        1980 No            3 0.6   60%  
##  5        1980 Yes           2 0.4   40%  
##  6        1990 No           41 0.774 77.4%
##  7        1990 Yes          12 0.226 22.6%
##  8        2000 No           96 0.865 86.5%
##  9        2000 Yes          15 0.135 13.5%
## 10        2010 No          225 0.830 83%  
## 11        2010 Yes          46 0.170 17%  
## 12        2020 No           59 0.868 86.8%
## 13        2020 Yes           9 0.132 13.2%

Look at this in a plot:

soju |> 
  filter(!is.na(full_body)) |> 
  filter(full_body %in% c('Yes', 'No')) |> 
  count(year_binned, full_body) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = full_body)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')  

Not really a temporal trend, other than the 1960’s being very different.

6.13.14 Camera shot

Let’s look at the camera angles:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(camera_shot)
## # A tibble: 6 × 3
##   camera_shot       n p    
##   <chr>         <int> <chr>
## 1 Bust shot       174 33%  
## 2 Medium shot     162 31%  
## 3 Knee shot        86 16%  
## 4 Full shot        70 13%  
## 5 Close-up shot    16 3%   
## 6 No               14 3%

Bodo action points: Why are there “No” instances? If there’s a model, there should be a camera shot!

Look at this over time:

soju |> 
  filter(!is.na(camera_shot)) |> 
  filter(camera_shot != 'No') |> 
  count(year_binned, camera_shot) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = camera_shot)) +
  geom_col(color = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_brewer(palette = 'Spectral') +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')  

6.13.15 Eye contact

Does the model make eye contact?

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(eye_contact)
## # A tibble: 5 × 3
##   eye_contact                 n p    
##   <chr>                   <int> <chr>
## 1 Yes                       417 80%  
## 2 No                         71 14%  
## 3 <NA>                       29 6%   
## 4 Female = yes, male = no     4 1%   
## 5 Male = yes                  1 0%

Bodo action points: Why are there NAs?

soju |> 
  filter(!is.na(eye_contact)) |> 
  filter(!eye_contact %in% c('Female = yes, male = no',
                            'Male = yes')) |> 
  count(year_binned, eye_contact) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = eye_contact)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top') 

Definitely more likely to make eye contact in later years, it looks like!

6.13.16 Smile

Does the model smile?

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(smile)
## # A tibble: 3 × 3
##   smile     n p    
##   <chr> <int> <chr>
## 1 Yes     435 83%  
## 2 No       58 11%  
## 3 <NA>     29 6%

Bodo action points: Why are there NAs?

# Stacked proportion bars of smiling vs. not smiling, per year bin.
# Character levels map alphabetically: No = purple, Yes = goldenrod3.
soju |> 
  filter(!is.na(smile)) |> 
  count(year_binned, smile) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = smile)) +
  geom_col(colour = 'black') +
  labs(x = NULL, y = 'Proportion') +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  theme(legend.position = 'top',
        axis.text.x = element_text(angle = 45, hjust = 1))

6.13.17 Head tilt

Does the model do a head tilt?

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(head_tilt)
## # A tibble: 5 × 3
##   head_tilt     n p    
##   <chr>     <int> <chr>
## 1 No          338 65%  
## 2 Side        139 27%  
## 3 <NA>         29 6%   
## 4 Forward      11 2%   
## 5 Back          5 1%

Bodo action points: Why are there NAs?

soju |> 
  filter(!is.na(head_tilt)) |> 
  filter(head_tilt %in% c('No', 'Side')) |> 
  count(year_binned, head_tilt) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = head_tilt)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top') 

6.13.18 Wink

Let’s look at whether the model winked:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(wink)
## # A tibble: 3 × 3
##   wink      n p    
##   <chr> <int> <chr>
## 1 No      480 92%  
## 2 <NA>     31 6%   
## 3 Yes      11 2%

Bodo action points: Why are there NAs?

soju |> 
  filter(!is.na(wink)) |> 
  count(year_binned, wink) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = wink)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top') 

Only occurs in later years. While that could be fitting a more informal style, we have to note that we also have more models in later years and this percentage is uber-small, so it could also just be that the percentage is kinda constant, we just happen to have not enough data for the earlier years.

6.13.19 Puckered lips

Let’s look at the puckered_lips variable:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(puckered_lips)
## # A tibble: 3 × 3
##   puckered_lips     n p    
##   <chr>         <int> <chr>
## 1 No              480 92%  
## 2 <NA>             29 6%   
## 3 Yes              13 2%

Bodo action points: Why are there NAs?

soju |> 
  filter(!is.na(puckered_lips)) |> 
  count(year_binned, puckered_lips) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = puckered_lips)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top') 

Similar considerations as above.

6.13.20 Hand on face

Let’s look at the hand_on_face variable:

soju |> 
  filter(has_model == 'yes') |> 
  adorn_percentages(hand_on_face)
## # A tibble: 3 × 3
##   hand_on_face     n p    
##   <chr>        <int> <chr>
## 1 No             424 81%  
## 2 Yes             69 13%  
## 3 <NA>            29 6%

Bodo action points: Why are there NAs?

soju |> 
  filter(!is.na(hand_on_face)) |> 
  count(year_binned, hand_on_face) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = hand_on_face)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_fill_manual(values = c('purple', 'goldenrod3')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top') 

Similar considerations as above.

7 Other visual features

7.0.1 Background

Let’s look at background_color:

background_color_counts <- soju |> 
  filter(overall_color == 'Color') |> 
  count(background_color, sort = TRUE) |> 
  print(n = Inf)
## # A tibble: 93 × 2
##    background_color                  n
##    <chr>                         <int>
##  1 White                           138
##  2 Blue and white                   89
##  3 Blue                             64
##  4 Green                            53
##  5 Green and white                  46
##  6 Beige                            24
##  7 Grey                             18
##  8 Pink                             18
##  9 Yellow                           16
## 10 Blue and green                   14
## 11 Brown                            13
## 12 Red                              11
## 13 Black and white                  10
## 14 Pink and white                   10
## 15 Orange                            9
## 16 Green and brown                   8
## 17 Blue and pink                     7
## 18 Brown and yellow                  6
## 19 Orange and white                  6
## 20 Purple                            6
## 21 Blue and yellow                   5
## 22 Purple and blue                   5
## 23 Green and beige                   4
## 24 Green, white and blue             4
## 25 Grey and white                    4
## 26 Navy and white                    4
## 27 Red, green and white              4
## 28 Silver                            4
## 29 Yellow and white                  4
## 30 Black                             3
## 31 Black and blue                    3
## 32 Blue, green and white             3
## 33 Green and grey                    3
## 34 Green and yellow                  3
## 35 Orange and blue                   3
## 36 Pink and brown                    3
## 37 Red and white                     3
## 38 Teal                              3
## 39 White and yellow                  3
## 40 Yellow, blue and white            3
## 41 Beige and white                   2
## 42 Black and grey                    2
## 43 Brown and beige                   2
## 44 Brown and white                   2
## 45 Green and blue                    2
## 46 Green, blue and white             2
## 47 Green, blue and yellow            2
## 48 Orange and green                  2
## 49 Orange, blue and green            2
## 50 Pink and blue                     2
## 51 Pink and green                    2
## 52 Pink, blue and white              2
## 53 Purple and yellow                 2
## 54 Red and orange                    2
## 55 White and blue                    2
## 56 Yellow and blue                   2
## 57 Yellow, green and blue            2
## 58 Aqua                              1
## 59 Beige and blue                    1
## 60 Beige and brown                   1
## 61 Beige and green                   1
## 62 Black and red                     1
## 63 Black and yellow                  1
## 64 Blue and grey                     1
## 65 Blue and print                    1
## 66 Blue, green and yellow            1
## 67 Blue, pink and orange             1
## 68 Blue, pink and yellow             1
## 69 Blue, white and green             1
## 70 Blue, white and yellow            1
## 71 Brown and blue                    1
## 72 Brown and green                   1
## 73 Green and Grey                    1
## 74 Green, white and brown            1
## 75 Green, white and purple           1
## 76 Green, white, blue and yellow     1
## 77 Mauve                             1
## 78 Ochre and white                   1
## 79 Orange and black                  1
## 80 Photo                             1
## 81 Pink and gold                     1
## 82 Pink and purple                   1
## 83 Pink, blue and green              1
## 84 Purple and black                  1
## 85 Red, blue and white               1
## 86 Violet                            1
## 87 White and green                   1
## 88 White and pink                    1
## 89 Yello, red and blue               1
## 90 Yellow and green                  1
## 91 Yellow, black and white           1
## 92 Yellow, green and purple          1
## 93 Yellow, pink and blue             1
# Show:

background_color_counts
## # A tibble: 93 × 2
##    background_color     n
##    <chr>            <int>
##  1 White              138
##  2 Blue and white      89
##  3 Blue                64
##  4 Green               53
##  5 Green and white     46
##  6 Beige               24
##  7 Grey                18
##  8 Pink                18
##  9 Yellow              16
## 10 Blue and green      14
## # ℹ 83 more rows

Let’s look at the six most common again, extracting a vector of the first six rows from the table of counts we’ve just created:

# The six most frequent background colors; the counts table was
# built with sort = TRUE, so the first six rows are the top six.
keep_these <- head(background_color_counts$background_color, 6)

# Show:

keep_these
## [1] "White"           "Blue and white"  "Blue"            "Green"          
## [5] "Green and white" "Beige"

Ok, now, a plot over time:

# Stacked proportion bars of the six most common background colors
# over time, restricted to color (non-B/W) ads.
soju |> 
  filter(overall_color == 'Color',
         background_color %in% keep_these) |> 
  # Fix factor order to frequency order so fills line up with the
  # manual color vector below:
  mutate(background_color = factor(background_color, levels = keep_these)) |> 
  count(year_binned, background_color) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = background_color)) +
  geom_col(col = 'black') +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  # Fill colors mirror the actual background colors (White, Blue and
  # white, Blue, Green, Green and white, Beige), in keep_these order.
  # NOTE(review): the legend is suppressed below, so the 'white' fill
  # is only distinguishable by its black outline -- intended? confirm.
  scale_fill_manual(values = c('white', 'lightblue',
                               'blue', 'darkgreen', 'lightgreen',
                               'beige'),
                    name = NULL) +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Proportion') +
  xlab(NULL) +
  theme(legend.position = 'none')

# Save last plot to both figure folders:
ggsave('../figures/png/background_color.png', width = 5.2, height = 3.8)
ggsave('../figures/pdf/background_color.pdf', width = 5.2, height = 3.8)

Certainly they swapped backgrounds from white to blue or blue and white in the last two decades.

Check background_type:

soju |> 
  count(background_type, sort = TRUE)
## # A tibble: 205 × 2
##    background_type     n
##    <chr>           <int>
##  1 Plain             309
##  2 Blank              59
##  3 Sky                17
##  4 Beach              15
##  5 Lights             14
##  6 Snow               14
##  7 Restaurant         10
##  8 Snowflakes         10
##  9 Stars               8
## 10 Water               8
## # ℹ 195 more rows

205 rows… that’s far too many categories to look at. What should I do with this?

Build a model of p(y = has blue and white), which will be a logistic regression model:

# Factor-code the variable with desired level order:

# Binary outcome for the logistic regression; 'no' first so that the
# model estimates p(yes).
# NOTE(review): the name says "blue_or_white" but the condition
# matches only the single level 'Blue and white' -- rename or adjust
# the condition? Also, if_else() propagates NA, so rows with a
# missing background_color become NA and will be dropped by brm.
soju <- mutate(soju,
               blue_or_white_background = if_else(background_color == 'Blue and white',
                                                  'yes', 'no'),
               blue_or_white_background = factor(blue_or_white_background,
                                                 levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):

# Bernoulli GAM: p(blue-and-white background) as a smooth function of
# year, with by-company random intercepts. High adapt_delta and
# max_treedepth are set to curb divergent transitions.
blue_white_background_mdl <- brm(bf(blue_or_white_background ~ 1 +
                                 s(year) +
                                 (1|company)),
                            data = soju,
                            family = bernoulli,
                 
                            # MCMC settings:
                   
                            cores = 4, seed = 42,
                            chains = 4, iter = 6000, warmup = 4000,
                            control = list(adapt_delta = 0.99,
                                           max_treedepth = 12))

# Save model so the fit can be reloaded without re-sampling:

save(blue_white_background_mdl, file = '../models/blue_white_background_mdl.RData')

Load:

load('../models/blue_white_background_mdl.RData')

Show posterior predictive simulations:

pp_check(blue_white_background_mdl, ndraws = 100)

pp_check(blue_white_background_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

blue_white_background_mdl
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: blue_or_white_background ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     9.89      6.16     2.86    25.89 1.00     2424     3552
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.63      0.43     0.05     1.68 1.00     1576     2634
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -2.97      0.56    -4.38    -2.21 1.00     2571     2513
## syear_1      37.80     27.62    -0.08   107.08 1.00     3199     2293
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

blue_white_background_mdl_df <- conditional_effects(blue_white_background_mdl)$year

Make a plot of the curve:

# Plot core:

# Plot core: posterior mean curve (purple line) with its credible
# ribbon (grey), following the script-wide color convention.

blue_white_background_p <- blue_white_background_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # 'linewidth' replaces the 'size' aesthetic for line widths,
  # which is deprecated since ggplot2 3.4.0:
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

# Axes and labels: decade breaks on x, probability scale pinned to
# [0, 1] with no padding on y.

blue_white_background_p <- blue_white_background_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of blue and white background')

# Show and save (PDF and PNG versions of the same figure):

blue_white_background_p

ggsave('../figures/pdf/blue_and_white_background.pdf', blue_white_background_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/blue_and_white_background.png', blue_white_background_p,
       width = 5.8, height = 3.7)

7.1 Plants

Assess the presence of plants, first in the background:

soju |> 
  count(plants_trees_background)
## # A tibble: 3 × 2
##   plants_trees_background     n
##   <chr>                   <int>
## 1 No                        639
## 2 Yes, plant                 65
## 3 Yes, tree                  93

That’s easy enough to work with! Let’s plot that:

soju |> 
  count(year_binned, plants_trees_background) |> 
  group_by(year_binned) |> 
  mutate(prop = n / sum(n)) |> 
  ggplot(aes(x = year_binned, y = prop, fill = plants_trees_background)) +
  geom_col(col = 'black') +
  xlab(NULL) +
  ylab('Proportion') +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_fill_manual(values = c('purple', 'lightgreen', 'darkgreen')) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(#axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'none')

ggsave('../figures/png/plants_trees_background.png',
       width = 4.5, height = 3.8)

Skipping ahead now just to get a few more of the really interesting stuff out of the way. Will have to look at restaurant_background, food_background later.

Build a model of p(y = plant background), which will be a logistic regression model:

# Factor-code the variable with desired level order:

# Order levels so that 'yes' is the modeled outcome (p(yes)).
# NOTE(review): plant_background itself is not created in this
# section -- presumably derived from plants_trees_background earlier
# in the script; confirm it encodes 'yes'/'no' as assumed here.
soju <- mutate(soju,
               plant_background = factor(plant_background, levels = c('no', 'yes')))

# Generalized additive logistic regression model (with time splines):

# Bernoulli GAM: p(plant background) as a smooth function of year,
# with by-company random intercepts; same MCMC settings as the other
# background model for comparability.
plant_background_mdl <- brm(bf(plant_background ~ 1 +
                                 s(year) +
                                 (1|company)),
                            data = soju,
                            family = bernoulli,
                 
                            # MCMC settings:
                   
                            cores = 4, seed = 42,
                            chains = 4, iter = 6000, warmup = 4000,
                            control = list(adapt_delta = 0.99,
                                           max_treedepth = 12))

# Save model so the fit can be reloaded without re-sampling:

save(plant_background_mdl, file = '../models/plant_background_mdl.RData')

Load:

load('../models/plant_background_mdl.RData')

Show posterior predictive simulations:

pp_check(plant_background_mdl, ndraws = 100)

pp_check(plant_background_mdl, ndraws = 100, type = 'ecdf_overlay')

Check the model:

plant_background_mdl
## Warning: There were 2 divergent transitions after warmup. Increasing
## adapt_delta above 0.99 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: plant_background ~ 1 + s(year) + (1 | company) 
##    Data: soju (Number of observations: 797) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Smoothing Spline Hyperparameters:
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(syear_1)     3.16      2.35     0.17     9.19 1.00     2094     2917
## 
## Multilevel Hyperparameters:
## ~company (Number of levels: 32) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.57      0.23     0.21     1.09 1.00     2656     3556
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -1.68      0.23    -2.19    -1.28 1.00     4212     4595
## syear_1       5.81      6.32    -4.43    20.38 1.00     3787     4806
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Extract conditional effects for plotting:

plant_background_mdl_df <- conditional_effects(plant_background_mdl)$year

Make a plot of the curve:

# Plot core:

# Plot core: posterior mean curve (purple line) with its credible
# ribbon (grey), following the script-wide color convention.

plant_background_p <- plant_background_mdl_df |> 
  ggplot(aes(x = year, y = estimate__,
             ymin = lower__, ymax = upper__)) +
  geom_ribbon(fill = 'grey', alpha = 0.7) +
  # 'linewidth' replaces the 'size' aesthetic for line widths,
  # which is deprecated since ggplot2 3.4.0:
  geom_line(col = 'purple', linewidth = 1.25)

# Axes and labels:

# Axes and labels: decade breaks on x, probability scale pinned to
# [0, 1] with no padding on y.

plant_background_p <- plant_background_p +
  scale_x_continuous(breaks = seq(1960, 2020, 10)) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  xlab('Year') +
  ylab('Probability of plants in background')

# Show and save (PDF and PNG versions of the same figure):

plant_background_p

ggsave('../figures/pdf/plant_background.pdf', plant_background_p,
       width = 5.8, height = 3.7)
ggsave('../figures/png/plant_background.png', plant_background_p,
       width = 5.8, height = 3.7)

8 Multiple correspondence analysis (MCA)

Bodo action point: Need to think about NAs.

Create soju table that encodes everything into binary variables.

# Working copy of the data for the MCA; categorical columns are
# recoded below into parallel binary yes/no columns.

soju_binary <- soju

# NB: if_else() propagates NA, so rows where the source column is
# missing get NA (not 'no') in the derived column; per the note
# after this chunk, NAs are turned into 0's at extraction time.

soju_binary <- mutate(soju_binary,
                      # Logo location variables (vertical and
                      # horizontal position, one indicator each):
                      
                      logo_top = if_else(logo_vertical == 'top', 'yes', 'no'),
                      logo_center_v = if_else(logo_vertical == 'center', 'yes', 'no'),
                      logo_bottom = if_else(logo_vertical == 'bottom', 'yes', 'no'),
                      logo_left = if_else(logo_horizontal == 'left', 'yes', 'no'),
                      logo_center_h = if_else(logo_horizontal == 'center', 'yes', 'no'),
                      logo_right = if_else(logo_horizontal == 'right', 'yes', 'no'),
                      
                      # Logo modality and type:
                      
                      logo_separate = if_else(multimodal_logo_type == 'Separate', 'yes', 'no'),
                      logo_merged = if_else(multimodal_logo_type == 'Merged', 'yes', 'no'),
                      logo_word = if_else(logo_modality == 'Word', 'yes', 'no'),
                      logo_word_image = if_else(logo_modality == 'Word and image', 'yes', 'no'),
                      logo_image = if_else(logo_modality == 'Image', 'yes', 'no'),
                      
                      # Has bold face (for parallelism):
                      
                      has_boldface = if_else(font_weight == 'Bold', 'yes', 'no')
                      
                      )

Let’s extract everything and recode all NAs as 'no':

# Define the predictor names to extract, grouped by theme:

logo_preds <- c('logo_top', 'logo_center_v', 'logo_bottom',
                'logo_left', 'logo_center_h', 'logo_right',
                'logo_separate', 'logo_merged', 'logo_word_image',
                'logo_image')

# Hanja, roman and loan word variables:

script_preds <- c('hanja_red', 'roman_red', 'loan_word_red')

# Model variables:

model_preds <- c('has_model', 'has_female', 'has_male', 'has_mixed', 'has_group')

# Slogan ending variables:

slogan_preds <- c('verb_ending', 'noun_ending', 'sound_object', 'has_panmal')

# Font color variables:

color_preds <- c('any_green', 'any_blue')

# Other variables:

other_preds <- c('has_calligraphy', 'has_boldface')

# Background variables:

background_preds <- c('plant_background')

# Bottle variables:

bottle_preds <- c('has_bottle',
                  'has_green_bottle', 'has_clear_bottle', 'has_black_or_brown_bottle')

# Full predictor vector, in the same order as before:

pred_vector <- c(logo_preds, script_preds, model_preds, slogan_preds,
                 color_preds, other_preds, background_preds, bottle_preds)

# Extract the predictor columns:

soju_M <- soju_binary[, pred_vector]

# Recode NAs as 'no' (absence of a feature). mutate_all() is superseded
# in dplyr; across(everything(), ...) is the current idiom:

soju_M <- mutate(soju_M,
                 across(everything(),
                        \(x) if_else(is.na(x), 'no', x)))

Run the MCA:

# Fit the MCA on the binary table; graph = FALSE suppresses base-graphics plots:
soju_MCA <- soju_M |> 
  as.matrix() |> 
  MCA(graph = FALSE)

Make a scree plot to see how many dimensions are useful:

# Scree plot: percentage of variance explained per MCA dimension,
# with percentage labels on each bar:

soju_scree <- fviz_screeplot(soju_MCA, addlabels = TRUE)

# Show:

soju_scree

Correlation between variables and dimensions:

# Correlation of each variable with the first two MCA dimensions
# (choice = 'mca.cor'); repel = TRUE keeps labels from overlapping:

MCA_vars <- fviz_mca_var(soju_MCA,
                         choice = 'mca.cor', repel = TRUE, ggtheme = theme_classic())

# Show:

MCA_vars

Check contributions for dimensions:

# Top 15 variable contributions to each of the first three dimensions:

dim1_contribution <- fviz_contrib(soju_MCA, choice = 'var', axes = 1, top = 15)
dim2_contribution <- fviz_contrib(soju_MCA, choice = 'var', axes = 2, top = 15)
dim3_contribution <- fviz_contrib(soju_MCA, choice = 'var', axes = 3, top = 15)

# Show (nothing is written to disk here):

dim1_contribution

dim2_contribution

dim3_contribution

Get the coordinates for the first three dimensions, to plot them pairwise below:

# First three dimensions of the per-ad (individual) coordinates
# (columns 1:3 of the coordinate matrix):

# get_mca_ind (factoextra::get_mca_ind() is an equivalent accessor)

soju_coordinates <- soju_MCA$ind$coord[, 1:3] |> 
  as_tibble()

# Append alcohol, year, and company (assumes soju rows are in the same
# order as the MCA input — they are, since soju_M was built from soju):

soju_coordinates <- bind_cols(select(soju,
                                     alcohol_content, year, company),
                              soju_coordinates)

Make the plot of two dimensions:

# Dimension 1 vs. 2, points colored by alcohol content
# (small jitter to separate overlapping points):
ggplot(soju_coordinates,
       aes(x = `Dim 1`, y = `Dim 2`, col = alcohol_content)) +
  geom_jitter(width = 0.03, height = 0.03, alpha = 0.5)

Third versus second, since first is always logo:

# Dimension 2 vs. 3, points colored by alcohol content
# (small jitter to separate overlapping points):
ggplot(soju_coordinates,
       aes(x = `Dim 2`, y = `Dim 3`, col = alcohol_content)) +
  geom_jitter(width = 0.03, height = 0.03, alpha = 0.5)

First versus third, since first is always logo:

# Dimension 1 vs. 3, points colored by alcohol content
# (small jitter to separate overlapping points):
ggplot(soju_coordinates,
       aes(x = `Dim 1`, y = `Dim 3`, col = alcohol_content)) +
  geom_jitter(width = 0.03, height = 0.03, alpha = 0.5)

9 Random forest

Create a variable that has all predictors:

# Join all predictor names into a single formula right-hand side:

pred_pluses <- paste(pred_vector, collapse = ' + ')

# Check:

pred_pluses
## [1] "logo_top + logo_center_v + logo_bottom + logo_left + logo_center_h + logo_right + logo_separate + logo_merged + logo_word_image + logo_image + hanja_red + roman_red + loan_word_red + has_model + has_female + has_male + has_mixed + has_group + verb_ending + noun_ending + sound_object + has_panmal + any_green + any_blue + has_calligraphy + has_boldface + plant_background + has_bottle + has_green_bottle + has_clear_bottle + has_black_or_brown_bottle"

Build a random forest predicting alcohol content:

# Random forest predicting alcohol content from all binary predictors,
# currently disabled pending the NA decision noted in the action point.
# NOTE(review): ranger() accepts a character formula, but probability = TRUE
# assumes a categorical outcome — confirm alcohol_content is a factor/binned
# variable before uncommenting.

# alcohol_forest <- ranger(formula = str_c('alcohol_content ~ ', pred_pluses),
#                    importance = 'permutation', num.trees = 1500,
#                    probability = TRUE,
#                    seed = 42,
#                    data = soju_binary)

Bodo action point: Need to think about NAs.